diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 9863ff087ca86..c375fa5dc7516 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -499,6 +499,7 @@ clang:static analyzer: - clang/tools/scan-build/** - clang/utils/analyzer/** - clang/docs/analyzer/** + - clang/test/Analysis/** pgo: - llvm/lib/Transforms/Instrumentation/CGProfile.cpp diff --git a/clang-tools-extra/clangd/CollectMacros.cpp b/clang-tools-extra/clangd/CollectMacros.cpp index 96298ee3ea50a..1e7d765f0b6f1 100644 --- a/clang-tools-extra/clangd/CollectMacros.cpp +++ b/clang-tools-extra/clangd/CollectMacros.cpp @@ -18,10 +18,13 @@ namespace clang { namespace clangd { -Range MacroOccurrence::toRange(const SourceManager &SM) const { +CharSourceRange MacroOccurrence::toSourceRange(const SourceManager &SM) const { auto MainFile = SM.getMainFileID(); - return halfOpenToRange( - SM, syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM)); + return syntax::FileRange(MainFile, StartOffset, EndOffset).toCharRange(SM); +} + +Range MacroOccurrence::toRange(const SourceManager &SM) const { + return halfOpenToRange(SM, toSourceRange(SM)); } void CollectMainFileMacros::add(const Token &MacroNameTok, const MacroInfo *MI, diff --git a/clang-tools-extra/clangd/CollectMacros.h b/clang-tools-extra/clangd/CollectMacros.h index e7198641d8d53..20a3fc24d759c 100644 --- a/clang-tools-extra/clangd/CollectMacros.h +++ b/clang-tools-extra/clangd/CollectMacros.h @@ -31,6 +31,7 @@ struct MacroOccurrence { // True if the occurence is used in a conditional directive, e.g. #ifdef MACRO bool InConditionalDirective; + CharSourceRange toSourceRange(const SourceManager &SM) const; Range toRange(const SourceManager &SM) const; }; diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp index 1de7faf81746e..3f5633357073d 100644 --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -713,7 +713,8 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) { // Add macro references. for (const auto &IDToRefs : MacroRefsToIndex.MacroRefs) { for (const auto &MacroRef : IDToRefs.second) { - const auto &Range = MacroRef.toRange(SM); + const auto &SR = MacroRef.toSourceRange(SM); + auto Range = halfOpenToRange(SM, SR); bool IsDefinition = MacroRef.IsDefinition; Ref R; R.Location.Start.setLine(Range.start.line); @@ -726,9 +727,7 @@ void SymbolCollector::handleMacros(const MainFileMacros &MacroRefsToIndex) { if (IsDefinition) { Symbol S; S.ID = IDToRefs.first; - auto StartLoc = cantFail(sourceLocationInMainFile(SM, Range.start)); - auto EndLoc = cantFail(sourceLocationInMainFile(SM, Range.end)); - S.Name = toSourceCode(SM, SourceRange(StartLoc, EndLoc)); + S.Name = toSourceCode(SM, SR.getAsRange()); S.SymInfo.Kind = index::SymbolKind::Macro; S.SymInfo.SubKind = index::SymbolSubKind::None; S.SymInfo.Properties = index::SymbolPropertySet(); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index a91c764860ccd..5780f5d61d579 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -170,6 +170,7 @@ Bug Fixes to C++ Support - Clang is now better at keeping track of friend function template instance contexts. (#GH55509) - The initialization kind of elements of structured bindings direct-list-initialized from an array is corrected to direct-initialization. +- Clang no longer crashes when a coroutine is declared ``[[noreturn]]``. 
(#GH127327) Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index ff4f236c1fa88..0f98d237dcbcd 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -4579,25 +4579,97 @@ class ShuffleVectorExpr : public Expr { /// ConvertVectorExpr - Clang builtin function __builtin_convertvector /// This AST node provides support for converting a vector type to another /// vector type of the same arity. -class ConvertVectorExpr : public Expr { +class ConvertVectorExpr final + : public Expr, + private llvm::TrailingObjects<ConvertVectorExpr, FPOptionsOverride> { private: Stmt *SrcExpr; TypeSourceInfo *TInfo; SourceLocation BuiltinLoc, RParenLoc; + friend TrailingObjects; friend class ASTReader; friend class ASTStmtReader; - explicit ConvertVectorExpr(EmptyShell Empty) : Expr(ConvertVectorExprClass, Empty) {} + explicit ConvertVectorExpr(bool HasFPFeatures, EmptyShell Empty) + : Expr(ConvertVectorExprClass, Empty) { + ConvertVectorExprBits.HasFPFeatures = HasFPFeatures; + } -public: ConvertVectorExpr(Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType, ExprValueKind VK, ExprObjectKind OK, - SourceLocation BuiltinLoc, SourceLocation RParenLoc) + SourceLocation BuiltinLoc, SourceLocation RParenLoc, + FPOptionsOverride FPFeatures) : Expr(ConvertVectorExprClass, DstType, VK, OK), SrcExpr(SrcExpr), TInfo(TI), BuiltinLoc(BuiltinLoc), RParenLoc(RParenLoc) { + ConvertVectorExprBits.HasFPFeatures = FPFeatures.requiresTrailingStorage(); + if (hasStoredFPFeatures()) + setStoredFPFeatures(FPFeatures); setDependence(computeDependence(this)); } + size_t numTrailingObjects(OverloadToken<FPOptionsOverride>) const { + return ConvertVectorExprBits.HasFPFeatures ? 1 : 0; + } + + FPOptionsOverride &getTrailingFPFeatures() { + assert(ConvertVectorExprBits.HasFPFeatures); + return *getTrailingObjects<FPOptionsOverride>(); + } + + const FPOptionsOverride &getTrailingFPFeatures() const { + assert(ConvertVectorExprBits.HasFPFeatures); + return *getTrailingObjects<FPOptionsOverride>(); + } + +public: + static ConvertVectorExpr *CreateEmpty(const ASTContext &C, + bool hasFPFeatures); + + static ConvertVectorExpr *Create(const ASTContext &C, Expr *SrcExpr, + TypeSourceInfo *TI, QualType DstType, + ExprValueKind VK, ExprObjectKind OK, + SourceLocation BuiltinLoc, + SourceLocation RParenLoc, + FPOptionsOverride FPFeatures); + + /// Get the FP contractibility status of this operator. Only meaningful for + /// operations on floating point types. + bool isFPContractableWithinStatement(const LangOptions &LO) const { + return getFPFeaturesInEffect(LO).allowFPContractWithinStatement(); + } + + /// Is FPFeatures in Trailing Storage? + bool hasStoredFPFeatures() const { + return ConvertVectorExprBits.HasFPFeatures; + } + + /// Get FPFeatures from trailing storage. + FPOptionsOverride getStoredFPFeatures() const { + return getTrailingFPFeatures(); + } + + /// Get the stored FPOptionsOverride or default if not stored. + FPOptionsOverride getStoredFPFeaturesOrDefault() const { + return hasStoredFPFeatures() ? getStoredFPFeatures() : FPOptionsOverride(); + } + + /// Set FPFeatures in trailing storage, used by Serialization & ASTImporter. + void setStoredFPFeatures(FPOptionsOverride F) { getTrailingFPFeatures() = F; } + + /// Get the FP features status of this operator. Only meaningful for + /// operations on floating point types.
+ FPOptions getFPFeaturesInEffect(const LangOptions &LO) const { + if (ConvertVectorExprBits.HasFPFeatures) + return getStoredFPFeatures().applyOverrides(LO); + return FPOptions::defaultWithoutTrailingStorage(LO); + } + + FPOptionsOverride getFPOptionsOverride() const { + if (ConvertVectorExprBits.HasFPFeatures) + return getStoredFPFeatures(); + return FPOptionsOverride(); + } + /// getSrcExpr - Return the Expr to be converted. Expr *getSrcExpr() const { return cast(SrcExpr); } diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index 405c6166adb15..604ac51d478cf 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -1215,6 +1215,20 @@ class alignas(void *) Stmt { SourceLocation Loc; }; + class ConvertVectorExprBitfields { + friend class ConvertVectorExpr; + + LLVM_PREFERRED_TYPE(ExprBitfields) + unsigned : NumExprBits; + + // + /// This is only meaningful for operations on floating point + /// types when additional values need to be in trailing storage. + /// It is 0 otherwise. + LLVM_PREFERRED_TYPE(bool) + unsigned HasFPFeatures : 1; + }; + union { // Same order as in StmtNodes.td. // Statements @@ -1293,6 +1307,7 @@ class alignas(void *) Stmt { // Clang Extensions OpaqueValueExprBitfields OpaqueValueExprBits; + ConvertVectorExprBitfields ConvertVectorExprBits; }; public: diff --git a/clang/include/clang/AST/TextNodeDumper.h b/clang/include/clang/AST/TextNodeDumper.h index 4b5ad2b5fa74c..81844db2c77fa 100644 --- a/clang/include/clang/AST/TextNodeDumper.h +++ b/clang/include/clang/AST/TextNodeDumper.h @@ -425,6 +425,7 @@ class TextNodeDumper void VisitOpenACCAsteriskSizeExpr(const OpenACCAsteriskSizeExpr *S); void VisitEmbedExpr(const EmbedExpr *S); void VisitAtomicExpr(const AtomicExpr *AE); + void VisitConvertVectorExpr(const ConvertVectorExpr *S); }; } // namespace clang diff --git a/clang/include/clang/Basic/Cuda.h b/clang/include/clang/Basic/Cuda.h index f33ba46233a7a..793cab1f4e84a 100644 --- a/clang/include/clang/Basic/Cuda.h +++ b/clang/include/clang/Basic/Cuda.h @@ -106,8 +106,6 @@ enum class OffloadArch { GFX90a, GFX90c, GFX9_4_GENERIC, - GFX940, - GFX941, GFX942, GFX950, GFX10_1_GENERIC, diff --git a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td index 305a06427ed0e..73759cfa9c3c9 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIRDialect.td +++ b/clang/include/clang/CIR/Dialect/IR/CIRDialect.td @@ -28,6 +28,8 @@ def CIR_Dialect : Dialect { let useDefaultTypePrinterParser = 0; let extraClassDeclaration = [{ + static llvm::StringRef getTripleAttrName() { return "cir.triple"; } + void registerAttributes(); void registerTypes(); diff --git a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h index 5f9110bc83b89..99495f4718c5f 100644 --- a/clang/include/clang/CIR/FrontendAction/CIRGenAction.h +++ b/clang/include/clang/CIR/FrontendAction/CIRGenAction.h @@ -25,8 +25,11 @@ class CIRGenConsumer; class CIRGenAction : public clang::ASTFrontendAction { public: enum class OutputType { + EmitAssembly, EmitCIR, EmitLLVM, + EmitBC, + EmitObj, }; private: @@ -63,6 +66,27 @@ class EmitLLVMAction : public CIRGenAction { EmitLLVMAction(mlir::MLIRContext *MLIRCtx = nullptr); }; +class EmitBCAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitBCAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitAssemblyAction : public CIRGenAction { + virtual void anchor(); + +public: + 
EmitAssemblyAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + +class EmitObjAction : public CIRGenAction { + virtual void anchor(); + +public: + EmitObjAction(mlir::MLIRContext *MLIRCtx = nullptr); +}; + } // namespace cir #endif diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index c27ebbf838ad1..43da76e14d0a3 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -7386,9 +7386,10 @@ ExpectedStmt ASTNodeImporter::VisitConvertVectorExpr(ConvertVectorExpr *E) { if (Err) return std::move(Err); - return new (Importer.getToContext()) - ConvertVectorExpr(ToSrcExpr, ToTSI, ToType, E->getValueKind(), - E->getObjectKind(), ToBuiltinLoc, ToRParenLoc); + return ConvertVectorExpr::Create( + Importer.getToContext(), ToSrcExpr, ToTSI, ToType, E->getValueKind(), + E->getObjectKind(), ToBuiltinLoc, ToRParenLoc, + E->getStoredFPFeaturesOrDefault()); } ExpectedStmt ASTNodeImporter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 503c58a67adeb..a35aa9471a73d 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -3397,7 +3397,8 @@ bool Compiler<Emitter>::VisitCXXNewExpr(const CXXNewExpr *E) { CtorFunc = getFunction(CE->getConstructor()); if (!CtorFunc) return false; - } + } else if (!DynamicInit) + DynamicInit = Init; LabelTy EndLabel = this->getLabel(); LabelTy StartLabel = this->getLabel(); diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index ca74046038072..fa113aa0bb157 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1132,13 +1132,14 @@ bool CMP3(InterpState &S, CodePtr OpPC, const ComparisonCategoryInfo *CmpInfo) { const Pointer &P = S.Stk.peek<Pointer>(); ComparisonCategoryResult CmpResult = LHS.compare(RHS); - if (CmpResult == ComparisonCategoryResult::Unordered) { - // This should only happen with pointers. - const SourceInfo &Loc = S.Current->getSource(OpPC); - S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) - << LHS.toDiagnosticString(S.getASTContext()) - << RHS.toDiagnosticString(S.getASTContext()); - return false; + if constexpr (std::is_same_v<T, Pointer>) { + if (CmpResult == ComparisonCategoryResult::Unordered) { + const SourceInfo &Loc = S.Current->getSource(OpPC); + S.FFDiag(Loc, diag::note_constexpr_pointer_comparison_unspecified) + << LHS.toDiagnosticString(S.getASTContext()) + << RHS.toDiagnosticString(S.getASTContext()); + return false; + } } assert(CmpInfo); diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 7fb89bf5b499f..63caf04f7ef38 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -786,12 +786,16 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( QualType T, bool ParameterPack, TypeSourceInfo *TInfo) { AutoType *AT = C.getLangOpts().CPlusPlus20 ? T->getContainedAutoType() : nullptr; - return new (C, DC, - additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, - Expr *>(0, - AT && AT->isConstrained() ? 1 : 0)) - NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, ParameterPack, - TInfo); + const bool HasConstraint = AT && AT->isConstrained(); + auto *NTTP = + new (C, DC, + additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>( + 0, HasConstraint ?
1 : 0)) + NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, + ParameterPack, TInfo); + if (HasConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( @@ -800,23 +804,30 @@ NonTypeTemplateParmDecl *NonTypeTemplateParmDecl::Create( QualType T, TypeSourceInfo *TInfo, ArrayRef<QualType> ExpandedTypes, ArrayRef<TypeSourceInfo *> ExpandedTInfos) { AutoType *AT = TInfo->getType()->getContainedAutoType(); - return new (C, DC, - additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, - Expr *>( - ExpandedTypes.size(), AT && AT->isConstrained() ? 1 : 0)) - NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo, - ExpandedTypes, ExpandedTInfos); + const bool HasConstraint = AT && AT->isConstrained(); + auto *NTTP = + new (C, DC, + additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>( + ExpandedTypes.size(), HasConstraint ? 1 : 0)) + NonTypeTemplateParmDecl(DC, StartLoc, IdLoc, D, P, Id, T, TInfo, + ExpandedTypes, ExpandedTInfos); + if (HasConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl * NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, bool HasTypeConstraint) { - return new (C, ID, additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, - Expr *>(0, - HasTypeConstraint ? 1 : 0)) + auto *NTTP = + new (C, ID, + additionalSizeToAlloc<std::pair<QualType, TypeSourceInfo *>, Expr *>( + 0, HasTypeConstraint ? 1 : 0)) NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0, nullptr, QualType(), false, nullptr); + if (HasTypeConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); + return NTTP; } NonTypeTemplateParmDecl * @@ -830,6 +841,8 @@ NonTypeTemplateParmDecl::CreateDeserialized(ASTContext &C, GlobalDeclID ID, NonTypeTemplateParmDecl(nullptr, SourceLocation(), SourceLocation(), 0, 0, nullptr, QualType(), nullptr, {}, {}); NTTP->NumExpandedTypes = NumExpandedTypes; + if (HasTypeConstraint) + NTTP->setPlaceholderTypeConstraint(nullptr); return NTTP; } diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 1f949d495f343..b747aa8df807d 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -3911,6 +3911,8 @@ FPOptions Expr::getFPFeaturesInEffect(const LangOptions &LO) const { return BO->getFPFeaturesInEffect(LO); if (auto Cast = dyn_cast<CastExpr>(this)) return Cast->getFPFeaturesInEffect(LO); + if (auto ConvertVector = dyn_cast<ConvertVectorExpr>(this)) + return ConvertVector->getFPFeaturesInEffect(LO); return FPOptions::defaultWithoutTrailingStorage(LO); } @@ -5451,3 +5453,21 @@ OpenACCAsteriskSizeExpr * OpenACCAsteriskSizeExpr::CreateEmpty(const ASTContext &C) { return new (C) OpenACCAsteriskSizeExpr({}, C.IntTy); } + +ConvertVectorExpr *ConvertVectorExpr::CreateEmpty(const ASTContext &C, + bool hasFPFeatures) { + void *Mem = C.Allocate(totalSizeToAlloc<FPOptionsOverride>(hasFPFeatures), + alignof(ConvertVectorExpr)); + return new (Mem) ConvertVectorExpr(hasFPFeatures, EmptyShell()); +} + +ConvertVectorExpr *ConvertVectorExpr::Create( + const ASTContext &C, Expr *SrcExpr, TypeSourceInfo *TI, QualType DstType, + ExprValueKind VK, ExprObjectKind OK, SourceLocation BuiltinLoc, + SourceLocation RParenLoc, FPOptionsOverride FPFeatures) { + bool HasFPFeatures = FPFeatures.requiresTrailingStorage(); + unsigned Size = totalSizeToAlloc<FPOptionsOverride>(HasFPFeatures); + void *Mem = C.Allocate(Size, alignof(ConvertVectorExpr)); + return new (Mem) ConvertVectorExpr(SrcExpr, TI, DstType, VK, OK, BuiltinLoc, + RParenLoc, FPFeatures); +} diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 26493caa5d06a..fd1eaab9621dd 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++
b/clang/lib/AST/TextNodeDumper.cpp @@ -3069,3 +3069,9 @@ void TextNodeDumper::VisitEmbedExpr(const EmbedExpr *S) { void TextNodeDumper::VisitAtomicExpr(const AtomicExpr *AE) { OS << ' ' << AE->getOpAsString(); } + +void TextNodeDumper::VisitConvertVectorExpr(const ConvertVectorExpr *S) { + VisitStmt(S); + if (S->hasStoredFPFeatures()) + printFPOptions(S->getStoredFPFeatures()); +} diff --git a/clang/lib/Basic/Cuda.cpp b/clang/lib/Basic/Cuda.cpp index 1bfec0b37c5ee..f45fb0eca3714 100644 --- a/clang/lib/Basic/Cuda.cpp +++ b/clang/lib/Basic/Cuda.cpp @@ -124,8 +124,6 @@ static const OffloadArchToStringMap arch_names[] = { GFX(90a), // gfx90a GFX(90c), // gfx90c {OffloadArch::GFX9_4_GENERIC, "gfx9-4-generic", "compute_amdgcn"}, - GFX(940), // gfx940 - GFX(941), // gfx941 GFX(942), // gfx942 GFX(950), // gfx950 {OffloadArch::GFX10_1_GENERIC, "gfx10-1-generic", "compute_amdgcn"}, diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp index 7d13c1f145440..547cf3dfa2be7 100644 --- a/clang/lib/Basic/Targets/NVPTX.cpp +++ b/clang/lib/Basic/Targets/NVPTX.cpp @@ -211,8 +211,6 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts, case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 2615ae382cb8b..cbecdf925aa5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -52,6 +52,9 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, DoubleTy = cir::DoubleType::get(&getMLIRContext()); FP80Ty = cir::FP80Type::get(&getMLIRContext()); FP128Ty = cir::FP128Type::get(&getMLIRContext()); + + theModule->setAttr(cir::CIRDialect::getTripleAttrName(), + builder.getStringAttr(getTriple().str())); } mlir::Location CIRGenModule::getLoc(SourceLocation cLoc) { diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 1c7ed63773900..29bb4036218e4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -21,7 +21,9 @@ #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/MLIRContext.h" #include "clang/Basic/SourceManager.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringRef.h" +#include "llvm/TargetParser/Triple.h" namespace clang { class ASTContext; @@ -88,6 +90,8 @@ class CIRGenModule : public CIRGenTypeCache { void emitGlobalVarDefinition(const clang::VarDecl *vd, bool isTentative = false); + const llvm::Triple &getTriple() const { return target.getTriple(); } + /// Helpers to emit "not yet implemented" error diagnostics DiagnosticBuilder errorNYI(SourceLocation, llvm::StringRef); diff --git a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp index eab6958ac8f6d..0f686a36b982b 100644 --- a/clang/lib/CIR/FrontendAction/CIRGenAction.cpp +++ b/clang/lib/CIR/FrontendAction/CIRGenAction.cpp @@ -27,8 +27,14 @@ getBackendActionFromOutputType(CIRGenAction::OutputType Action) { assert(false && "Unsupported output type for getBackendActionFromOutputType!"); break; // Unreachable, but fall through to report that + case CIRGenAction::OutputType::EmitAssembly: + return BackendAction::Backend_EmitAssembly; + case CIRGenAction::OutputType::EmitBC: + return BackendAction::Backend_EmitBC; case CIRGenAction::OutputType::EmitLLVM: return 
BackendAction::Backend_EmitLL; + case CIRGenAction::OutputType::EmitObj: + return BackendAction::Backend_EmitObj; } // We should only get here if a non-enum value is passed in or we went through // the assert(false) case above @@ -84,7 +90,10 @@ class CIRGenConsumer : public clang::ASTConsumer { MlirModule->print(*OutputStream, Flags); } break; - case CIRGenAction::OutputType::EmitLLVM: { + case CIRGenAction::OutputType::EmitLLVM: + case CIRGenAction::OutputType::EmitBC: + case CIRGenAction::OutputType::EmitObj: + case CIRGenAction::OutputType::EmitAssembly: { llvm::LLVMContext LLVMCtx; std::unique_ptr LLVMModule = lowerFromCIRToLLVMIR(MlirModule, LLVMCtx); @@ -111,10 +120,16 @@ static std::unique_ptr getOutputStream(CompilerInstance &CI, StringRef InFile, CIRGenAction::OutputType Action) { switch (Action) { + case CIRGenAction::OutputType::EmitAssembly: + return CI.createDefaultOutputFile(false, InFile, "s"); case CIRGenAction::OutputType::EmitCIR: return CI.createDefaultOutputFile(false, InFile, "cir"); case CIRGenAction::OutputType::EmitLLVM: return CI.createDefaultOutputFile(false, InFile, "ll"); + case CIRGenAction::OutputType::EmitBC: + return CI.createDefaultOutputFile(true, InFile, "bc"); + case CIRGenAction::OutputType::EmitObj: + return CI.createDefaultOutputFile(true, InFile, "o"); } llvm_unreachable("Invalid CIRGenAction::OutputType"); } @@ -132,6 +147,10 @@ CIRGenAction::CreateASTConsumer(CompilerInstance &CI, StringRef InFile) { return Result; } +void EmitAssemblyAction::anchor() {} +EmitAssemblyAction::EmitAssemblyAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitAssembly, MLIRCtx) {} + void EmitCIRAction::anchor() {} EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitCIR, MLIRCtx) {} @@ -139,3 +158,11 @@ EmitCIRAction::EmitCIRAction(mlir::MLIRContext *MLIRCtx) void EmitLLVMAction::anchor() {} EmitLLVMAction::EmitLLVMAction(mlir::MLIRContext *MLIRCtx) : CIRGenAction(OutputType::EmitLLVM, MLIRCtx) {} + +void EmitBCAction::anchor() {} +EmitBCAction::EmitBCAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitBC, MLIRCtx) {} + +void EmitObjAction::anchor() {} +EmitObjAction::EmitObjAction(mlir::MLIRContext *MLIRCtx) + : CIRGenAction(OutputType::EmitObj, MLIRCtx) {} diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 74ff89346f3c4..235b5a057852a 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -114,6 +114,8 @@ struct ConvertCIRToLLVMPass } void runOnOperation() final; + void processCIRAttrs(mlir::ModuleOp module); + StringRef getDescription() const override { return "Convert the prepared CIR dialect module to LLVM dialect"; } @@ -271,6 +273,13 @@ static void prepareTypeConverter(mlir::LLVMTypeConverter &converter, }); } +void ConvertCIRToLLVMPass::processCIRAttrs(mlir::ModuleOp module) { + // Lower the module attributes to LLVM equivalents. 
+ if (auto tripleAttr = module->getAttr(cir::CIRDialect::getTripleAttrName())) + module->setAttr(mlir::LLVM::LLVMDialect::getTargetTripleAttrName(), + tripleAttr); +} + void ConvertCIRToLLVMPass::runOnOperation() { llvm::TimeTraceScope scope("Convert CIR to LLVM Pass"); @@ -283,6 +292,8 @@ void ConvertCIRToLLVMPass::runOnOperation() { patterns.add(converter, patterns.getContext(), dl); + processCIRAttrs(module); + mlir::ConversionTarget target(getContext()); target.addLegalOp(); target.addLegalDialect(); diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 348cb523b1718..4688381040be2 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -859,6 +859,24 @@ static void emitSincosBuiltin(CodeGenFunction &CGF, const CallExpr *E, StoreCos->setMetadata(LLVMContext::MD_noalias, AliasScopeList); } +static llvm::Value *emitModfBuiltin(CodeGenFunction &CGF, const CallExpr *E, + llvm::Intrinsic::ID IntrinsicID) { + llvm::Value *Val = CGF.EmitScalarExpr(E->getArg(0)); + llvm::Value *IntPartDest = CGF.EmitScalarExpr(E->getArg(1)); + + llvm::Value *Call = + CGF.Builder.CreateIntrinsic(IntrinsicID, {Val->getType()}, Val); + + llvm::Value *FractionalResult = CGF.Builder.CreateExtractValue(Call, 0); + llvm::Value *IntegralResult = CGF.Builder.CreateExtractValue(Call, 1); + + QualType DestPtrType = E->getArg(1)->getType()->getPointeeType(); + LValue IntegralLV = CGF.MakeNaturalAlignAddrLValue(IntPartDest, DestPtrType); + CGF.EmitStoreOfScalar(IntegralResult, IntegralLV); + + return FractionalResult; +} + /// EmitFAbs - Emit a call to @llvm.fabs(). static Value *EmitFAbs(CodeGenFunction &CGF, Value *V) { Function *F = CGF.CGM.getIntrinsic(Intrinsic::fabs, V->getType()); @@ -3377,11 +3395,16 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(emitUnaryMaybeConstrainedFPBuiltin( *this, E, Intrinsic::sinh, Intrinsic::experimental_constrained_sinh)); + case Builtin::BIsincos: + case Builtin::BIsincosf: + case Builtin::BIsincosl: case Builtin::BI__builtin_sincos: case Builtin::BI__builtin_sincosf: case Builtin::BI__builtin_sincosf16: case Builtin::BI__builtin_sincosl: case Builtin::BI__builtin_sincosf128: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained sincos intrinsic once one exists. emitSincosBuiltin(*this, E, Intrinsic::sincos); return RValue::get(nullptr); @@ -4107,6 +4130,15 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, case Builtin::BI__builtin_frexpf128: case Builtin::BI__builtin_frexpf16: return RValue::get(emitFrexpBuiltin(*this, E, Intrinsic::frexp)); + case Builtin::BImodf: + case Builtin::BImodff: + case Builtin::BImodfl: + case Builtin::BI__builtin_modf: + case Builtin::BI__builtin_modff: + case Builtin::BI__builtin_modfl: + if (Builder.getIsFPConstrained()) + break; // TODO: Emit constrained modf intrinsic once one exists. 
+ return RValue::get(emitModfBuiltin(*this, E, Intrinsic::modf)); case Builtin::BI__builtin_isgreater: case Builtin::BI__builtin_isgreaterequal: case Builtin::BI__builtin_isless: diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 30f01496ba221..5ee8a1bfa8175 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -1949,6 +1949,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { llvm::Value *Zero = llvm::Constant::getNullValue(SrcTy); if (SrcEltTy->isFloatingPointTy()) { + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); return Builder.CreateFCmpUNE(Src, Zero, "tobool"); } else { return Builder.CreateICmpNE(Src, Zero, "tobool"); @@ -1975,6 +1976,7 @@ Value *ScalarExprEmitter::VisitConvertVectorExpr(ConvertVectorExpr *E) { } else { assert(SrcEltTy->isFloatingPointTy() && DstEltTy->isFloatingPointTy() && "Unknown real conversion"); + CodeGenFunction::CGFPOptionsRAII FPOptions(CGF, E); if (DstEltTy->getTypeID() < SrcEltTy->getTypeID()) Res = Builder.CreateFPTrunc(Src, DstTy, "conv"); else diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp index c13928f61a748..826ec4da8ea28 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntimeGPU.cpp @@ -2302,8 +2302,6 @@ void CGOpenMPRuntimeGPU::processRequiresDirective(const OMPRequiresDecl *D) { case OffloadArch::GFX90a: case OffloadArch::GFX90c: case OffloadArch::GFX9_4_GENERIC: - case OffloadArch::GFX940: - case OffloadArch::GFX941: case OffloadArch::GFX942: case OffloadArch::GFX950: case OffloadArch::GFX10_1_GENERIC: diff --git a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp index c8d004163b96d..bb3bb0aac78bf 100644 --- a/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp +++ b/clang/lib/FrontendTool/ExecuteCompilerInvocation.cpp @@ -62,8 +62,18 @@ CreateFrontendBaseAction(CompilerInstance &CI) { return std::make_unique<DumpCompilerOptionsAction>(); case DumpRawTokens: return std::make_unique<DumpRawTokensAction>(); case DumpTokens: return std::make_unique<DumpTokensAction>(); - case EmitAssembly: return std::make_unique<EmitAssemblyAction>(); - case EmitBC: return std::make_unique<EmitBCAction>(); + case EmitAssembly: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique<cir::EmitAssemblyAction>(); +#endif + return std::make_unique<EmitAssemblyAction>(); + case EmitBC: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique<cir::EmitBCAction>(); +#endif + return std::make_unique<EmitBCAction>(); case EmitCIR: #if CLANG_ENABLE_CIR return std::make_unique<cir::EmitCIRAction>(); @@ -80,7 +90,12 @@ CreateFrontendBaseAction(CompilerInstance &CI) { } case EmitLLVMOnly: return std::make_unique<EmitLLVMOnlyAction>(); case EmitCodeGenOnly: return std::make_unique<EmitCodeGenOnlyAction>(); - case EmitObj: return std::make_unique<EmitObjAction>(); + case EmitObj: +#if CLANG_ENABLE_CIR + if (UseCIR) + return std::make_unique<cir::EmitObjAction>(); +#endif + return std::make_unique<EmitObjAction>(); case ExtractAPI: return std::make_unique<ExtractAPIAction>(); case FixIt: return std::make_unique<FixItAction>(); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index aae61f612a4bc..74f425d32648f 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -5262,8 +5262,8 @@ ExprResult Sema::ConvertVectorExpr(Expr *E, TypeSourceInfo *TInfo, << E->getSourceRange()); } - return new (Context) class ConvertVectorExpr(E, TInfo, DstTy, VK, OK, - BuiltinLoc, RParenLoc); + return ConvertVectorExpr::Create(Context, E, TInfo, DstTy, VK, OK, BuiltinLoc, + RParenLoc, CurFPFeatureOverrides()); } bool Sema::BuiltinPrefetch(CallExpr *TheCall) { diff --git
a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 05cac8db3c42c..eaabfae2409f4 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -12716,7 +12716,7 @@ TreeTransform::TransformDeclRefExpr(DeclRefExpr *E) { ValueDecl *ND = cast_or_null(getDerived().TransformDecl(E->getLocation(), E->getDecl())); - if (!ND) + if (!ND || ND->isInvalidDecl()) return ExprError(); NamedDecl *Found = ND; diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index fba54023a6bb2..835ad4a658944 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -1387,10 +1387,15 @@ void ASTStmtReader::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtReader::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = CurrentUnpackingBits->getNextBit(); + assert(HasFPFeatures == E->hasStoredFPFeatures()); E->BuiltinLoc = readSourceLocation(); E->RParenLoc = readSourceLocation(); E->TInfo = readTypeSourceInfo(); E->SrcExpr = Record.readSubExpr(); + if (HasFPFeatures) + E->setStoredFPFeatures( + FPOptionsOverride::getFromOpaqueInt(Record.readInt())); } void ASTStmtReader::VisitBlockExpr(BlockExpr *E) { @@ -3385,9 +3390,13 @@ Stmt *ASTReader::ReadStmtFromStream(ModuleFile &F) { S = new (Context) ShuffleVectorExpr(Empty); break; - case EXPR_CONVERT_VECTOR: - S = new (Context) ConvertVectorExpr(Empty); + case EXPR_CONVERT_VECTOR: { + BitsUnpacker ConvertVectorExprBits(Record[ASTStmtReader::NumStmtFields]); + ConvertVectorExprBits.advance(ASTStmtReader::NumExprBits); + bool HasFPFeatures = ConvertVectorExprBits.getNextBit(); + S = ConvertVectorExpr::CreateEmpty(Context, HasFPFeatures); break; + } case EXPR_BLOCK: S = new (Context) BlockExpr(Empty); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index b25dadab656b0..ac80bb46afa2d 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -2016,8 +2016,7 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // For an expanded parameter pack, record the number of expansion types here // so that it's easier for deserialization to allocate the right amount of // memory. - Expr *TypeConstraint = D->getPlaceholderTypeConstraint(); - Record.push_back(!!TypeConstraint); + Record.push_back(D->hasPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) Record.push_back(D->getNumExpansionTypes()); @@ -2025,8 +2024,9 @@ void ASTDeclWriter::VisitNonTypeTemplateParmDecl(NonTypeTemplateParmDecl *D) { // TemplateParmPosition. 
Record.push_back(D->getDepth()); Record.push_back(D->getPosition()); - if (TypeConstraint) - Record.AddStmt(TypeConstraint); + + if (D->hasPlaceholderTypeConstraint()) + Record.AddStmt(D->getPlaceholderTypeConstraint()); if (D->isExpandedParameterPack()) { for (unsigned I = 0, N = D->getNumExpansionTypes(); I != N; ++I) { diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 2687231d7820f..82738d3a8c88a 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1335,11 +1335,15 @@ void ASTStmtWriter::VisitShuffleVectorExpr(ShuffleVectorExpr *E) { void ASTStmtWriter::VisitConvertVectorExpr(ConvertVectorExpr *E) { VisitExpr(E); + bool HasFPFeatures = E->hasStoredFPFeatures(); + CurrentPackingBits.addBit(HasFPFeatures); Record.AddSourceLocation(E->getBuiltinLoc()); Record.AddSourceLocation(E->getRParenLoc()); Record.AddTypeSourceInfo(E->getTypeSourceInfo()); Record.AddStmt(E->getSrcExpr()); Code = serialization::EXPR_CONVERT_VECTOR; + if (HasFPFeatures) + Record.push_back(E->getStoredFPFeatures().getAsOpaqueInt()); } void ASTStmtWriter::VisitBlockExpr(BlockExpr *E) { diff --git a/clang/test/AST/ByteCode/cxx20.cpp b/clang/test/AST/ByteCode/cxx20.cpp index 6f65fa5c7cfd3..06501de64916a 100644 --- a/clang/test/AST/ByteCode/cxx20.cpp +++ b/clang/test/AST/ByteCode/cxx20.cpp @@ -626,6 +626,8 @@ namespace ThreeWayCmp { constexpr int k = (1 <=> 1, 0); // both-warning {{comparison result unused}} static_assert(k== 0, ""); + static_assert(__builtin_nanf("") <=> __builtin_nanf("") == -127, ""); + /// Pointers. constexpr int a[] = {1,2,3}; constexpr int b[] = {1,2,3}; diff --git a/clang/test/AST/ByteCode/new-delete.cpp b/clang/test/AST/ByteCode/new-delete.cpp index 7e5f6ab8815ea..5be1bb944c18c 100644 --- a/clang/test/AST/ByteCode/new-delete.cpp +++ b/clang/test/AST/ByteCode/new-delete.cpp @@ -907,6 +907,12 @@ namespace IncompleteArray { return c; } static_assert(test4() == 12); + + + constexpr char *f(int n) { + return new char[n](); + } + static_assert((delete[] f(2), true)); } namespace NonConstexprArrayCtor { diff --git a/clang/test/AST/ast-dump-fpfeatures.cpp b/clang/test/AST/ast-dump-fpfeatures.cpp index cd00650db55cc..eeead3462c0ec 100644 --- a/clang/test/AST/ast-dump-fpfeatures.cpp +++ b/clang/test/AST/ast-dump-fpfeatures.cpp @@ -8,6 +8,17 @@ // RUN: | sed -e "s/ //" -e "s/ imported//" \ // RUN: | FileCheck --strict-whitespace %s +// CHECK-LABEL: FunctionDecl {{.*}} no_fpfeatures_func_01 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*\>$}} +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} 'vector2float':'__attribute__((__vector_size__(2 * sizeof(float)))) float'{{$}} + +typedef double vector2double __attribute__((__vector_size__(16))); +typedef float vector2float __attribute__((__vector_size__(8))); +vector2float no_fpfeatures_func_01(vector2double x) { + return __builtin_convertvector(x, vector2float); +} + float func_01(float x); template @@ -248,4 +259,14 @@ __attribute__((optnone)) T func_22(T x, T y) { float func_23(float x, float y) { return func_22(x, y); -} \ No newline at end of file +} + +// CHECK-LABEL: FunctionDecl {{.*}} func_24 'vector2float (vector2double)' +// CHECK: CompoundStmt {{.*}} FPContractMode=2 ConstRoundingMode=towardzero +// CHECK: ReturnStmt +// CHECK: ConvertVectorExpr {{.*}} FPContractMode=2 ConstRoundingMode=towardzero + +#pragma STDC FENV_ROUND FE_TOWARDZERO +vector2float func_24(vector2double x) { + return __builtin_convertvector(x, vector2float); 
+} diff --git a/clang/test/AST/const-fpfeatures.c b/clang/test/AST/const-fpfeatures.c index 8dc3221b0638a..787bb989dd4a2 100644 --- a/clang/test/AST/const-fpfeatures.c +++ b/clang/test/AST/const-fpfeatures.c @@ -22,6 +22,12 @@ float _Complex C1u = C0; float FLu = 0.1F; // CHECK: @FLu = {{.*}} float 0x3FB99999A0000000 +typedef float vector2float __attribute__((__vector_size__(8))); +typedef double vector2double __attribute__((__vector_size__(16))); +const vector2float V2Fu = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Du = __builtin_convertvector(V2Fu, vector2double); +// CHECK: @V2Fu = {{.*}} <2 x float> splat (float 0x3FF0000020000000) +// CHECK: @V2Du = {{.*}} <2 x double> splat (double 0x3FF0000020000000) #pragma STDC FENV_ROUND FE_DOWNWARD @@ -41,3 +47,8 @@ float _Complex C1d = C0; float FLd = 0.1F; // CHECK: @FLd = {{.*}} float 0x3FB9999980000000 + +const vector2float V2Fd = {1.0F + 0x0.000001p0F, 1.0F + 0x0.000002p0F}; +vector2double V2Dd = __builtin_convertvector(V2Fd, vector2double); +// CHECK: @V2Fd = {{.*}} <2 x float> +// CHECK: @V2Dd = {{.*}} <2 x double> diff --git a/clang/test/CIR/emit-actions.cpp b/clang/test/CIR/emit-actions.cpp new file mode 100644 index 0000000000000..94ddf23b34753 --- /dev/null +++ b/clang/test/CIR/emit-actions.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -S %s -o - | FileCheck %s -check-prefix=ASM + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm-bc %s -o %t.bc +// RUN: llvm-dis %t.bc -o - | FileCheck %s -check-prefix=BC + +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-obj %s -o %t.o +// RUN: llvm-objdump -t %t.o | FileCheck %s -check-prefix=OBJ + +// TODO: Make this test target-independent +// REQUIRES: x86-registered-target + +int x = 1; + +// BC: @x = dso_local global i32 1 + +// ASM: x: +// ASM: .long 1 +// ASM: .size x, 4 + +// OBJ: .data +// OBJ-SAME: x diff --git a/clang/test/CodeGen/AArch64/sincos.c b/clang/test/CodeGen/AArch64/sincos.c index b77d98ceab486..736c0892ed741 100644 --- a/clang/test/CodeGen/AArch64/sincos.c +++ b/clang/test/CodeGen/AArch64/sincos.c @@ -1,6 +1,10 @@ // RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -O1 %s -o - | FileCheck --check-prefix=NO-MATH-ERRNO %s // RUN: %clang_cc1 -triple=aarch64-gnu-linux -emit-llvm -fmath-errno %s -o - | FileCheck --check-prefix=MATH-ERRNO %s +void sincos(double, double*, double*); +void sincosf(float, float*, float*); +void sincosl(long double, long double*, long double*); + // NO-MATH-ERRNO-LABEL: @sincos_f32 // NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) // NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 @@ -12,6 +16,20 @@ // MATH-ERRNO: call void @sincosf( // void sincos_f32(float x, float* fp0, float* fp1) { + sincosf(x, fp0, fp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f32 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { float, float } @llvm.sincos.f32(float {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { float, float } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { float, float } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store float [[SIN]], ptr {{.*}}, align 4, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store float [[COS]], ptr {{.*}}, align 4, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f32 +// MATH-ERRNO: call void @sincosf( +// +void sincos_builtin_f32(float x, float* fp0, float* fp1) { __builtin_sincosf(x, 
fp0, fp1); } @@ -26,6 +44,20 @@ void sincos_f32(float x, float* fp0, float* fp1) { // MATH-ERRNO: call void @sincos( // void sincos_f64(double x, double* dp0, double* dp1) { + sincos(x, dp0, dp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f64 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { double, double } @llvm.sincos.f64(double {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { double, double } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { double, double } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store double [[SIN]], ptr {{.*}}, align 8, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store double [[COS]], ptr {{.*}}, align 8, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f64 +// MATH-ERRNO: call void @sincos( +// +void sincos_builtin_f64(double x, double* dp0, double* dp1) { __builtin_sincos(x, dp0, dp1); } @@ -40,5 +72,19 @@ void sincos_f64(double x, double* dp0, double* dp1) { // MATH-ERRNO: call void @sincosl( // void sincos_f128(long double x, long double* ldp0, long double* ldp1) { + sincosl(x, ldp0, ldp1); +} + +// NO-MATH-ERRNO-LABEL: @sincos_builtin_f128 +// NO-MATH-ERRNO: [[SINCOS:%.*]] = tail call { fp128, fp128 } @llvm.sincos.f128(fp128 {{.*}}) +// NO-MATH-ERRNO-NEXT: [[SIN:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 0 +// NO-MATH-ERRNO-NEXT: [[COS:%.*]] = extractvalue { fp128, fp128 } [[SINCOS]], 1 +// NO-MATH-ERRNO-NEXT: store fp128 [[SIN]], ptr {{.*}}, align 16, !alias.scope [[SINCOS_ALIAS_SCOPE:![0-9]+]] +// NO-MATH-ERRNO-NEXT: store fp128 [[COS]], ptr {{.*}}, align 16, !noalias [[SINCOS_ALIAS_SCOPE]] +// +// MATH-ERRNO-LABEL: @sincos_builtin_f128 +// MATH-ERRNO: call void @sincosl( +// +void sincos_builtin_f128(long double x, long double* ldp0, long double* ldp1) { __builtin_sincosl(x, ldp0, ldp1); } diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c index d7bf7d57fba26..d5301b7bafd9c 100644 --- a/clang/test/CodeGen/X86/math-builtins.c +++ b/clang/test/CodeGen/X86/math-builtins.c @@ -38,6 +38,24 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // NO__ERRNO-NEXT: [[FREXP_F128_0:%.+]] = extractvalue { fp128, i32 } [[FREXP_F128]], 0 +// NO__ERRNO: [[MODF_F64:%.+]] = call { double, double } @llvm.modf.f64(double %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F64_FP:%.+]] = extractvalue { double, double } [[MODF_F64]], 0 +// NO__ERRNO-NEXT: [[MODF_F64_IP:%.+]] = extractvalue { double, double } [[MODF_F64]], 1 +// NO__ERRNO-NEXT: store double [[MODF_F64_IP]], ptr %{{.+}}, align 8 + +// NO__ERRNO: [[MODF_F32:%.+]] = call { float, float } @llvm.modf.f32(float %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F32_FP:%.+]] = extractvalue { float, float } [[MODF_F32]], 0 +// NO__ERRNO-NEXT: [[MODF_F32_IP:%.+]] = extractvalue { float, float } [[MODF_F32]], 1 +// NO__ERRNO-NEXT: store float [[MODF_F32_IP]], ptr %{{.+}}, align 4 + +// NO__ERRNO: [[MODF_F80:%.+]] = call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) +// NO__ERRNO-NEXT: [[MODF_F80_FP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 0 +// NO__ERRNO-NEXT: [[MODF_F80_IP:%.+]] = extractvalue { x86_fp80, x86_fp80 } [[MODF_F80]], 1 +// NO__ERRNO-NEXT: store x86_fp80 [[MODF_F80_IP]], ptr %{{.+}}, align 16 + +// NO__ERRNO: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + + // NO__ERRNO: [[SINCOS_F64:%.+]] = call { double, double } @llvm.sincos.f64(double %{{.+}}) // NO__ERRNO-NEXT: [[SINCOS_F64_0:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 0 // 
NO__ERRNO-NEXT: [[SINCOS_F64_1:%.+]] = extractvalue { double, double } [[SINCOS_F64]], 1 @@ -139,13 +157,13 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { __builtin_modf(f,d); __builtin_modff(f,fp); __builtin_modfl(f,l); __builtin_modff128(f,l); -// NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] -// NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] -// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] -// HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] +// NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE:#[0-9]+]] +// HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_ERRNO: declare fp128 @modff128(fp128 noundef, ptr noundef) [[NOT_READNONE]] __builtin_nan(c); __builtin_nanf(c); __builtin_nanl(c); __builtin_nanf128(c); diff --git a/clang/test/CodeGen/aix-builtin-mapping.c b/clang/test/CodeGen/aix-builtin-mapping.c index a79218c6f1d8b..cc1cc1a44f32c 100644 --- a/clang/test/CodeGen/aix-builtin-mapping.c +++ b/clang/test/CodeGen/aix-builtin-mapping.c @@ -17,6 +17,6 @@ int main() returnValue = __builtin_ldexpl(1.0L, 1); } -// CHECK: %call = call double @modf(double noundef 1.000000e+00, ptr noundef %DummyLongDouble) #3 +// CHECK: %{{.+}} = call { double, double } @llvm.modf.f64(double 1.000000e+00) // CHECK: %{{.+}} = call { double, i32 } @llvm.frexp.f64.i32(double 0.000000e+00) // CHECK: %{{.+}} = call double @llvm.ldexp.f64.i32(double 1.000000e+00, i32 1) diff --git a/clang/test/CodeGen/allow-ubsan-check.c b/clang/test/CodeGen/allow-ubsan-check.c index c116604288546..0cd81a77f5cc5 100644 --- a/clang/test/CodeGen/allow-ubsan-check.c +++ b/clang/test/CodeGen/allow-ubsan-check.c @@ -86,7 +86,7 @@ int div(int x, int y) { } // CHECK-LABEL: define dso_local i32 @null( -// CHECK-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // @@ -102,7 +102,7 @@ int div(int x, int y) { // CHECK-NEXT: ret i32 [[TMP2]] // // TR-LABEL: define dso_local i32 @null( -// TR-SAME: ptr noundef readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// TR-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // TR-NEXT: [[ENTRY:.*:]] // TR-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // TR-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] @@ -116,7 +116,7 @@ int div(int x, int y) { // TR-NEXT: ret i32 [[TMP2]] // // REC-LABEL: define dso_local i32 @null( -// REC-SAME: ptr noundef 
readonly captures(address_is_null) [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { +// REC-SAME: ptr noundef readonly [[X:%.*]]) local_unnamed_addr #[[ATTR0]] { // REC-NEXT: [[ENTRY:.*:]] // REC-NEXT: [[TMP0:%.*]] = icmp eq ptr [[X]], null, !nosanitize [[META2]] // REC-NEXT: [[TMP1:%.*]] = tail call i1 @llvm.allow.ubsan.check(i8 29), !nosanitize [[META2]] diff --git a/clang/test/CodeGen/builtin-attributes.c b/clang/test/CodeGen/builtin-attributes.c index e5b0faccfd23f..506b165fcf36e 100644 --- a/clang/test/CodeGen/builtin-attributes.c +++ b/clang/test/CodeGen/builtin-attributes.c @@ -24,6 +24,11 @@ char* f2(char* a, char* b) { return __builtin_strstr(a, b); } +// Note: Use asm label to disable intrinsic lowering of modf. +double modf(double x, double*) asm("modf"); +float modff(float x, float*) asm("modff"); +long double modfl(long double x, long double*) asm("modfl"); + // frexp is NOT readnone. It writes to its pointer argument. // // CHECK: f3 @@ -55,9 +60,9 @@ int f3(double x) { frexp(x, &e); frexpf(x, &e); frexpl(x, &e); - __builtin_modf(x, &e); - __builtin_modff(x, &e); - __builtin_modfl(x, &e); + modf(x, &e); + modff(x, &e); + modfl(x, &e); __builtin_remquo(x, x, &e); __builtin_remquof(x, x, &e); __builtin_remquol(x, x, &e); diff --git a/clang/test/CodeGen/math-builtins-long.c b/clang/test/CodeGen/math-builtins-long.c index 183349e0f0173..87e64a2eaa1c3 100644 --- a/clang/test/CodeGen/math-builtins-long.c +++ b/clang/test/CodeGen/math-builtins-long.c @@ -58,9 +58,9 @@ void foo(long double f, long double *l, int *i, const char *c) { // PPCF128: call fp128 @ldexpf128(fp128 noundef %{{.+}}, {{(signext)?.+}}) __builtin_ldexpl(f,f); - // F80: call x86_fp80 @modfl(x86_fp80 noundef %{{.+}}, ptr noundef %{{.+}}) - // PPC: call ppc_fp128 @modfl(ppc_fp128 noundef %{{.+}}, ptr noundef %{{.+}}) - // X86F128: call fp128 @modfl(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) + // F80: call { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80 %{{.+}}) + // PPC: call { ppc_fp128, ppc_fp128 } @llvm.modf.ppcf128(ppc_fp128 %{{.+}}) + // X86F128: call { fp128, fp128 } @llvm.modf.f128(fp128 %{{.+}}) // PPCF128: call fp128 @modff128(fp128 noundef %{{.+}}, ptr noundef %{{.+}}) __builtin_modfl(f,l); diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c index bcc61c8f046b4..ad297828f48ed 100644 --- a/clang/test/CodeGen/math-libcalls.c +++ b/clang/test/CodeGen/math-libcalls.c @@ -83,12 +83,12 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { modf(f,d); modff(f,fp); modfl(f,l); - // NO__ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // NO__ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] - // HAS_ERRNO: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] + // NO__ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] + // NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { double, double } @llvm.modf.f64(double) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { float, float } @llvm.modf.f32(float) [[READNONE_INTRINSIC]] + // HAS_ERRNO: declare { x86_fp80, x86_fp80 } 
@llvm.modf.f80(x86_fp80) [[READNONE_INTRINSIC]] // HAS_MAYTRAP: declare double @modf(double noundef, ptr noundef) [[NOT_READNONE]] // HAS_MAYTRAP: declare float @modff(float noundef, ptr noundef) [[NOT_READNONE]] // HAS_MAYTRAP: declare x86_fp80 @modfl(x86_fp80 noundef, ptr noundef) [[NOT_READNONE]] @@ -660,6 +660,17 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) { // HAS_MAYTRAP: declare float @llvm.experimental.constrained.sinh.f32( // HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.sinh.f80( +sincos(f, d, d); sincosf(f, fp, fp); sincosl(f, l, l); + +// NO__ERRNO: declare { double, double } @llvm.sincos.f64(double) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { float, float } @llvm.sincos.f32(float) [[READNONE_INTRINSIC]] +// NO__ERRNO: declare { x86_fp80, x86_fp80 } @llvm.sincos.f80(x86_fp80) [[READNONE_INTRINSIC]] +// HAS_ERRNO: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_ERRNO: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincos(double noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosf(float noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] +// HAS_MAYTRAP: declare void @sincosl(x86_fp80 noundef, ptr noundef, ptr noundef) [[NOT_READNONE]] sqrt(f); sqrtf(f); sqrtl(f); diff --git a/clang/test/CodeGen/pragma-fenv_access.c b/clang/test/CodeGen/pragma-fenv_access.c index afca115ed08d1..347e9670c4742 100644 --- a/clang/test/CodeGen/pragma-fenv_access.c +++ b/clang/test/CodeGen/pragma-fenv_access.c @@ -242,3 +242,12 @@ float func_20(float x, float y) { // CHECK-LABEL: @func_20 // STRICT: call float @llvm.experimental.constrained.fadd.f32(float {{.*}}, float {{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict") // DEFAULT: fadd float + +typedef double vector4double __attribute__((__vector_size__(32))); +typedef float vector4float __attribute__((__vector_size__(16))); +vector4float func_21(vector4double x) { + #pragma STDC FENV_ROUND FE_UPWARD + return __builtin_convertvector(x, vector4float); +} +// CHECK-LABEL: @func_21 +// STRICT: call <4 x float> @llvm.experimental.constrained.fptrunc.v4f32.v4f64(<4 x double> {{.*}}, metadata !"round.upward", metadata !"fpexcept.strict") diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 47fa3967fe237..37fca614c3111 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -11,7 +11,7 @@ // RUN: -fnative-half-arguments-and-returns | FileCheck -check-prefix=SAFE %s // RUN: %clang_cc1 -x hip %s -O3 -S -o - -triple=amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -target-cpu gfx940 -fnative-half-type \ +// RUN: -fcuda-is-device -target-cpu gfx942 -fnative-half-type \ // RUN: -fnative-half-arguments-and-returns -munsafe-fp-atomics \ // RUN: | FileCheck -check-prefix=UNSAFE %s diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp index 3662a270713b6..83daf57be22ff 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -3,7 +3,7 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -o - -emit-llvm | FileCheck %s -// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned 
captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z6upcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } @@ -22,12 +22,12 @@ // CHECK: declare ptr @__dynamic_cast(ptr, ptr, ptr, i64) local_unnamed_addr -// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned captures(ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z8selfcastP1B(ptr noundef readnone returned %b) local_unnamed_addr // CHECK-NEXT: entry // CHECK-NEXT: ret ptr %b // CHECK-NEXT: } -// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly captures(address_is_null, ret: address, provenance) %b) local_unnamed_addr +// CHECK: define{{.*}} ptr @_Z9void_castP1B(ptr noundef readonly %b) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %b, null // CHECK-NEXT: br i1 [[isnull]], label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index 2a838708ca231..c471e5dbd7b33 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -24,7 +24,7 @@ // CHECK-NEXT: ret ptr @_ZTS1A // CHECK-NEXT: } -// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly captures(address_is_null) %a) local_unnamed_addr +// CHECK: define{{.*}} i1 @_Z5equalP1A(ptr noundef readonly %a) local_unnamed_addr // CHECK-NEXT: entry: // CHECK-NEXT: [[isnull:%[0-9]+]] = icmp eq ptr %a, null // CHECK-NEXT: br i1 [[isnull]], label %[[bad_typeid:[a-z0-9._]+]], label %[[end:[a-z0-9.+]+]] diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl index 62fd20c4d1414..0aadaad2dca5c 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl @@ -22,7 +22,7 @@ __amdgpu_buffer_rsrc_t getBuffer(void *p) { } // CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5)) // CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]] @@ -39,7 +39,7 @@ void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) { } // CHECK-LABEL: define {{[^@]+}}@test -// CHECK-SAME: (ptr addrspace(5) noundef readonly captures(address) [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { +// CHECK-SAME: (ptr addrspace(5) noundef readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]] // CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 633f1dec5e370..d12dcead6fadf 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -29,8 +29,6 @@ // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx909 -emit-llvm -o - %s | FileCheck --check-prefix=GFX909 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx90a -emit-llvm -o - %s | FileCheck --check-prefix=GFX90A %s // RUN: 
%clang_cc1 -triple amdgcn -target-cpu gfx90c -emit-llvm -o - %s | FileCheck --check-prefix=GFX90C %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx940 -emit-llvm -o - %s | FileCheck --check-prefix=GFX940 %s -// RUN: %clang_cc1 -triple amdgcn -target-cpu gfx941 -emit-llvm -o - %s | FileCheck --check-prefix=GFX941 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx942 -emit-llvm -o - %s | FileCheck --check-prefix=GFX942 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx950 -emit-llvm -o - %s | FileCheck --check-prefix=GFX950 %s // RUN: %clang_cc1 -triple amdgcn -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck --check-prefix=GFX1010 %s @@ -85,8 +83,6 @@ // GFX909: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90A: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX90C: "target-features"="+16-bit-insts,+ci-insts,+dpp,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" -// GFX940: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" -// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts" // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" // GFX950: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/as_type.cl b/clang/test/CodeGenOpenCL/as_type.cl index 2c6cdc3810b4d..1fe26fbeafdb4 100644 --- a/clang/test/CodeGenOpenCL/as_type.cl +++ b/clang/test/CodeGenOpenCL/as_type.cl @@ -67,7 +67,7 @@ int3 f8(char16 x) { return __builtin_astype(x, int3); } -//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone captures(ret: address, provenance) %[[x:.*]]) +//CHECK: define{{.*}} spir_func noundef ptr addrspace(1) @addr_cast(ptr noundef readnone %[[x:.*]]) //CHECK: %[[cast:.*]] ={{.*}} addrspacecast ptr %[[x]] to ptr addrspace(1) //CHECK: ret ptr addrspace(1) %[[cast]] global int* addr_cast(int *x) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl index 6593a8de566f6..f300b05fe798a 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl similarity index 98% rename from clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl index a2f14c652c828..789f6e07240d7 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx942.cl @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // REQUIRES: amdgpu-registered-target typedef unsigned int u32; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl index 521121f5e7e54..c91cf158948b9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950-err.cl @@ -2,7 +2,7 @@ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx90a -emit-llvm \ // RUN: -verify -o - %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 -emit-llvm \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 
-emit-llvm \ // RUN: -verify -o - %s // RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm \ // RUN: -verify -o - %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl index 45d2fa18efd53..b3367202f824e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gws-insts.cl @@ -5,7 +5,7 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx906 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90c -emit-llvm -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl index 00346baa6ff84..79083c3c5f0f9 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-mfma.cl @@ -1,7 +1,7 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx908 -DMFMA_GFX908_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX908 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx90a -DMFMA_GFX90A_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX90A -// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -DMFMA_GFX940_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940 +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx942 -DMFMA_GFX942_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX942 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx950 -DMFMA_GFX950_TESTS -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX950 #pragma OPENCL EXTENSION cl_khr_fp64:enable @@ -226,189 +226,189 @@ void test_mfma_f64_4x4x4f64(global double* out, double a, double b, double c) #endif // MFMA_GFX90A_TESTS -#if defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) -// CHECK-GFX940-LABEL: @test_mfma_i32_16x16x32_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) +#if defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) +// CHECK-GFX942-LABEL: @test_mfma_i32_16x16x32_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x32.i8(i64 %a, i64 %b, <4 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_16x16x32_i8(global v4i* out, long a, long b, v4i c) { *out = __builtin_amdgcn_mfma_i32_16x16x32_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_i32_32x32x16_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_i32_32x32x16_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x16.i8(i64 %a, i64 %b, <16 x i32> %c, i32 0, i32 0, i32 0) void test_mfma_i32_32x32x16_i8(global v16i* out, long a, long b, v16i c) { 
*out = __builtin_amdgcn_mfma_i32_32x32x16_i8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x8_xf32 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x8_xf32 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x8.xf32(<2 x float> %a, <2 x float> %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x8_xf32(global v4f* out, v2f a, v2f b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x8_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x4_xf32 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x4_xf32 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x4.xf32(<2 x float> %a, <2 x float> %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x4_xf32(global v16f* out, v2f a, v2f b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x4_xf32(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.bf8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_bf8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_bf8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.bf8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_bf8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_16x16x32_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.mfma.f32.16x16x32.fp8.fp8(i64 %a, i64 %b, <4 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_16x16x32_fp8_fp8(global v4f* out, long a, long b, v4f c) { *out = __builtin_amdgcn_mfma_f32_16x16x32_fp8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void 
test_mfma_f32_32x32x16_bf8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.bf8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_bf8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_bf8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.bf8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_bf8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_bf8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_mfma_f32_32x32x16_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.fp8.fp8(i64 %a, i64 %b, <16 x float> %c, i32 0, i32 0, i32 0) void test_mfma_f32_32x32x16_fp8_fp8(global v16f* out, long a, long b, v16f c) { *out = __builtin_amdgcn_mfma_f32_32x32x16_fp8_fp8(a, b, c, 0, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_f16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_f16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.f16(<4 x half> %a, <8 x half> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_f16(global v4f* out, v4h a, v8h b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_f16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_f16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.f16(<4 x half> %a, <8 x half> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_f16(global v16f* out, v4h a, v8h b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_f16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x32_bf16 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x32_bf16 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x32.bf16(<4 x i16> %a, <8 x i16> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x32_bf16(global v4f* out, v4s a, v8s b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x32_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x16_bf16 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) 
+// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x16_bf16 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x16.bf16(<4 x i16> %a, <8 x i16> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x16_bf16(global v16f* out, v4s a, v8s b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x16_bf16(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_16x16x64_i8 -// CHECK-GFX940: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_16x16x64_i8 +// CHECK-GFX942: call <4 x i32> @llvm.amdgcn.smfmac.i32.16x16x64.i8(<2 x i32> %a, <4 x i32> %b, <4 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_16x16x64_i8(global v4i* out, v2i a, v4i b, v4i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_16x16x64_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_i32_32x32x32_i8 -// CHECK-GFX940: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_i32_32x32x32_i8 +// CHECK-GFX942: call <16 x i32> @llvm.amdgcn.smfmac.i32.32x32x32.i8(<2 x i32> %a, <4 x i32> %b, <16 x i32> %c, i32 %idx, i32 0, i32 0) void test_smfmac_i32_32x32x32_i8(global v16i* out, v2i a, v4i b, v16i c, int idx) { *out = __builtin_amdgcn_smfmac_i32_32x32x32_i8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_bf8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_bf8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_bf8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_16x16x64_fp8_bf8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 -// CHECK-GFX940: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_16x16x64_fp8_fp8 +// CHECK-GFX942: call <4 x float> @llvm.amdgcn.smfmac.f32.16x16x64.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <4 x float> %c, i32 %idx, i32 0, i32 0) void 
test_smfmac_f32_16x16x64_fp8_fp8(global v4f* out, v2i a, v4i b, v4f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_16x16x64_fp8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_bf8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.bf8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_bf8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_bf8_fp8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_bf8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.bf8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_fp8_bf8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_bf8(a, b, c, idx, 0, 0); } -// CHECK-GFX940-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 -// CHECK-GFX940: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) +// CHECK-GFX942-LABEL: @test_smfmac_f32_32x32x32_fp8_fp8 +// CHECK-GFX942: call <16 x float> @llvm.amdgcn.smfmac.f32.32x32x32.fp8.fp8(<2 x i32> %a, <4 x i32> %b, <16 x float> %c, i32 %idx, i32 0, i32 0) void test_smfmac_f32_32x32x32_fp8_fp8(global v16f* out, v2i a, v4i b, v16f c, int idx) { *out = __builtin_amdgcn_smfmac_f32_32x32x32_fp8_fp8(a, b, c, idx, 0, 0); } -#endif // defined(MFMA_GFX940_TESTS) || defined(MFMA_GFX950_TESTS) +#endif // defined(MFMA_GFX942_TESTS) || defined(MFMA_GFX950_TESTS) #ifdef MFMA_GFX950_TESTS diff --git a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl similarity index 84% rename from clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl rename to clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl index 832d7df00db14..24d05fe3a8525 100644 --- a/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx940.cl +++ b/clang/test/CodeGenOpenCL/builtins-fp-atomics-gfx942.cl @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ // RUN: %s -emit-llvm -o - | FileCheck %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx940 \ -// RUN: -S -o - %s | FileCheck -check-prefix=GFX940 %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -target-cpu gfx942 \ +// RUN: -S -o - %s | FileCheck -check-prefix=GFX942 
%s // REQUIRES: amdgpu-registered-target @@ -12,8 +12,8 @@ typedef short __attribute__((ext_vector_type(2))) short2; // CHECK-LABEL: test_flat_add_f32 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, float %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_f32 -// GFX940: flat_atomic_add_f32 +// GFX942-LABEL: test_flat_add_f32 +// GFX942: flat_atomic_add_f32 half2 test_flat_add_f32(__generic float *addr, float x) { return __builtin_amdgcn_flat_atomic_fadd_f32(addr, x); } @@ -21,8 +21,8 @@ half2 test_flat_add_f32(__generic float *addr, float x) { // CHECK-LABEL: test_flat_add_2f16 // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x half> %{{.+}} syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} -// GFX940-LABEL: test_flat_add_2f16 -// GFX940: flat_atomic_pk_add_f16 +// GFX942-LABEL: test_flat_add_2f16 +// GFX942: flat_atomic_pk_add_f16 half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2f16(addr, x); } @@ -32,8 +32,8 @@ half2 test_flat_add_2f16(__generic half2 *addr, half2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_flat_add_2bf16 -// GFX940: flat_atomic_pk_add_bf16 +// GFX942-LABEL: test_flat_add_2bf16 +// GFX942: flat_atomic_pk_add_bf16 short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { return __builtin_amdgcn_flat_atomic_fadd_v2bf16(addr, x); } @@ -43,8 +43,8 @@ short2 test_flat_add_2bf16(__generic short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(1) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_global_add_2bf16 -// GFX940: global_atomic_pk_add_bf16 +// GFX942-LABEL: test_global_add_2bf16 +// GFX942: global_atomic_pk_add_bf16 short2 test_global_add_2bf16(__global short2 *addr, short2 x) { return __builtin_amdgcn_global_atomic_fadd_v2bf16(addr, x); } @@ -55,24 +55,24 @@ short2 test_global_add_2bf16(__global short2 *addr, short2 x) { // CHECK: [[RMW:%.+]] = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x bfloat> [[BC0]] syncscope("agent") monotonic, align 4{{$}} // CHECK-NEXT: bitcast <2 x bfloat> [[RMW]] to <2 x i16> -// GFX940-LABEL: test_local_add_2bf16 -// GFX940: ds_pk_add_rtn_bf16 +// GFX942-LABEL: test_local_add_2bf16 +// GFX942: ds_pk_add_rtn_bf16 short2 test_local_add_2bf16(__local short2 *addr, short2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2bf16(addr, x); } // CHECK-LABEL: test_local_add_2f16 // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16 -// GFX940: ds_pk_add_rtn_f16 +// GFX942-LABEL: test_local_add_2f16 +// GFX942: ds_pk_add_rtn_f16 half2 test_local_add_2f16(__local half2 *addr, half2 x) { return __builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } // CHECK-LABEL: test_local_add_2f16_noret // CHECK: = atomicrmw fadd ptr addrspace(3) %{{.+}}, <2 x half> %{{.+}} monotonic, align 4 -// GFX940-LABEL: test_local_add_2f16_noret -// GFX940: ds_pk_add_f16 +// GFX942-LABEL: test_local_add_2f16_noret +// GFX942: ds_pk_add_f16 void test_local_add_2f16_noret(__local half2 *addr, half2 x) { 
__builtin_amdgcn_ds_atomic_fadd_v2f16(addr, x); } diff --git a/clang/test/Driver/aarch64-mcpu.c b/clang/test/Driver/aarch64-mcpu.c index 97303510d6881..447ee4bd3a6f9 100644 --- a/clang/test/Driver/aarch64-mcpu.c +++ b/clang/test/Driver/aarch64-mcpu.c @@ -92,7 +92,7 @@ // COBALT-100: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-n2" // RUN: %clang --target=aarch64 -mcpu=grace -### -c %s 2>&1 | FileCheck -check-prefix=GRACE %s -// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "neoverse-v2" +// GRACE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "grace" // ================== Check whether -mcpu and -mtune accept mixed-case values. // RUN: %clang --target=aarch64 -mcpu=Cortex-a53 -### -c %s 2>&1 | FileCheck -check-prefix=CASE-INSENSITIVE-CA53 %s diff --git a/clang/test/Driver/amdgpu-macros.cl b/clang/test/Driver/amdgpu-macros.cl index d97b2ddb1fc66..35dc190761ca4 100644 --- a/clang/test/Driver/amdgpu-macros.cl +++ b/clang/test/Driver/amdgpu-macros.cl @@ -107,8 +107,6 @@ // RUN: %clang -E -dM -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx909 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90a -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx90c -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx940 -DFAMILY=GFX9 -// RUN: %clang -E -dM -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx941 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx942 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=64 -DCPU=gfx950 -DFAMILY=GFX9 // RUN: %clang -E -dM -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefixes=ARCH-GCN,FAST_FMAF %s -DWAVEFRONT_SIZE=32 -DCPU=gfx1010 -DFAMILY=GFX10 diff --git a/clang/test/Driver/amdgpu-mcpu.cl b/clang/test/Driver/amdgpu-mcpu.cl index 7c34d3ec6c63a..ad5fd8ebaa6a6 100644 --- a/clang/test/Driver/amdgpu-mcpu.cl +++ b/clang/test/Driver/amdgpu-mcpu.cl @@ -92,8 +92,6 @@ // RUN: %clang -### -target amdgcn -mcpu=gfx909 %s 2>&1 | FileCheck --check-prefix=GFX909 %s // RUN: %clang -### -target amdgcn -mcpu=gfx90a %s 2>&1 | FileCheck --check-prefix=GFX90A %s // RUN: %clang -### -target amdgcn -mcpu=gfx90c %s 2>&1 | FileCheck --check-prefix=GFX90C %s -// RUN: %clang -### -target amdgcn -mcpu=gfx940 %s 2>&1 | FileCheck --check-prefix=GFX940 %s -// RUN: %clang -### -target amdgcn -mcpu=gfx941 %s 2>&1 | FileCheck --check-prefix=GFX941 %s // RUN: %clang -### -target amdgcn -mcpu=gfx942 %s 2>&1 | FileCheck --check-prefix=GFX942 %s // RUN: %clang -### -target amdgcn -mcpu=gfx950 %s 2>&1 | FileCheck --check-prefix=GFX950 %s // RUN: %clang -### -target amdgcn -mcpu=gfx1010 %s 2>&1 | FileCheck --check-prefix=GFX1010 %s @@ -148,8 +146,6 @@ // GFX909: "-target-cpu" "gfx909" // GFX90A: "-target-cpu" "gfx90a" // GFX90C: "-target-cpu" "gfx90c" -// GFX940: "-target-cpu" "gfx940" -// GFX941: "-target-cpu" "gfx941" // GFX942: "-target-cpu" "gfx942" // GFX950: "-target-cpu" "gfx950" // GFX1010: "-target-cpu" "gfx1010" diff --git 
a/clang/test/Driver/cuda-bad-arch.cu b/clang/test/Driver/cuda-bad-arch.cu index 8c8c5c3401329..85231a5b9705a 100644 --- a/clang/test/Driver/cuda-bad-arch.cu +++ b/clang/test/Driver/cuda-bad-arch.cu @@ -23,7 +23,7 @@ // RUN: | FileCheck -check-prefix OK %s // RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx90a -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s -// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx940 -c %s 2>&1 \ +// RUN: %clang -### -x hip --target=x86_64-linux-gnu -nogpulib -nogpuinc --cuda-gpu-arch=gfx942 -c %s 2>&1 \ // RUN: | FileCheck -check-prefix OK %s // We don't allow using NVPTX/AMDGCN for host compilation. diff --git a/clang/test/Driver/hip-macros.hip b/clang/test/Driver/hip-macros.hip index 3b3afba0b18ca..bd93f9985a774 100644 --- a/clang/test/Driver/hip-macros.hip +++ b/clang/test/Driver/hip-macros.hip @@ -49,15 +49,13 @@ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=IMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ -// RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s -// RUN: %clang -E -dM --offload-arch=gfx941 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,NOWARN %s // RUN: %clang -E -dM --offload-arch=gfx1100 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "-image-insts" %s 2>&1 | FileCheck --check-prefixes=IMAGE,WARN %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -Xclang -target-feature -Xclang "+image-insts" %s 2>&1 | FileCheck --check-prefixes=NOIMAGE,WARN %s // NOWARN-NOT: warning // WARN: warning: feature flag '{{[+|-]}}image-insts' is ignored since the feature is read only [-Winvalid-command-line-argument] @@ -68,9 +66,9 @@ // RUN: %clang -E -dM --offload-arch=gfx1100 -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=per-thread %s 2>&1 | FileCheck --check-prefixes=PTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: -fgpu-default-stream=legacy %s 2>&1 | FileCheck --check-prefixes=NOPTS %s -// RUN: %clang -E -dM --offload-arch=gfx940 --cuda-device-only -nogpuinc -nogpulib \ +// RUN: %clang -E -dM --offload-arch=gfx942 --cuda-device-only -nogpuinc -nogpulib \ // RUN: %s 2>&1 | FileCheck --check-prefixes=NOPTS %s // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 // PTS-DAG: #define __HIP_API_PER_THREAD_DEFAULT_STREAM__ 1 diff --git a/clang/test/Driver/print-enabled-extensions/aarch64-grace.c b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c new file mode 100644 index 0000000000000..fde6aee468cdc --- /dev/null +++ b/clang/test/Driver/print-enabled-extensions/aarch64-grace.c @@ -0,0 +1,62 @@ +// REQUIRES: aarch64-registered-target +// RUN: %clang --target=aarch64 --print-enabled-extensions -mcpu=grace | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s + +// CHECK: 
Extensions enabled for the given AArch64 target +// CHECK-EMPTY: +// CHECK-NEXT: Architecture Feature(s) Description +// CHECK-NEXT: FEAT_AES, FEAT_PMULL Enable AES support +// CHECK-NEXT: FEAT_AMUv1 Enable Armv8.4-A Activity Monitors extension +// CHECK-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// CHECK-NEXT: FEAT_BF16 Enable BFloat16 Extension +// CHECK-NEXT: FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: FEAT_CCIDX Enable Armv8.3-A Extend of the CCSIDR number of sets +// CHECK-NEXT: FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions +// CHECK-NEXT: FEAT_CSV2_2 Enable architectural speculation restriction +// CHECK-NEXT: FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions +// CHECK-NEXT: FEAT_DPB Enable Armv8.2-A data Cache Clean to Point of Persistence +// CHECK-NEXT: FEAT_DPB2 Enable Armv8.5-A Cache Clean to Point of Deep Persistence +// CHECK-NEXT: FEAT_DotProd Enable dot product support +// CHECK-NEXT: FEAT_ETE Enable Embedded Trace Extension +// CHECK-NEXT: FEAT_FCMA Enable Armv8.3-A Floating-point complex number support +// CHECK-NEXT: FEAT_FHM Enable FP16 FML instructions +// CHECK-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// CHECK-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// CHECK-NEXT: FEAT_FRINTTS Enable FRInt[32|64][Z|X] instructions that round a floating-point number to an integer (in FP format) forcing it to fit into a 32- or 64-bit int +// CHECK-NEXT: FEAT_FlagM Enable Armv8.4-A Flag Manipulation instructions +// CHECK-NEXT: FEAT_FlagM2 Enable alternative NZCV format for floating point comparisons +// CHECK-NEXT: FEAT_I8MM Enable Matrix Multiply Int8 Extension +// CHECK-NEXT: FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions +// CHECK-NEXT: FEAT_LOR Enable Armv8.1-A Limited Ordering Regions extension +// CHECK-NEXT: FEAT_LRCPC Enable support for RCPC extension +// CHECK-NEXT: FEAT_LRCPC2 Enable Armv8.4-A RCPC instructions with Immediate Offsets +// CHECK-NEXT: FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions +// CHECK-NEXT: FEAT_LSE2 Enable Armv8.4-A Large System Extension 2 (LSE2) atomicity rules +// CHECK-NEXT: FEAT_MPAM Enable Armv8.4-A Memory system Partitioning and Monitoring extension +// CHECK-NEXT: FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension +// CHECK-NEXT: FEAT_NV, FEAT_NV2 Enable Armv8.4-A Nested Virtualization Enchancement +// CHECK-NEXT: FEAT_PAN Enable Armv8.1-A Privileged Access-Never extension +// CHECK-NEXT: FEAT_PAN2 Enable Armv8.2-A PAN s1e1R and s1e1W Variants +// CHECK-NEXT: FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension +// CHECK-NEXT: FEAT_PMUv3 Enable Armv8.0-A PMUv3 Performance Monitors extension +// CHECK-NEXT: FEAT_RAS, FEAT_RASv1p1 Enable Armv8.0-A Reliability, Availability and Serviceability Extensions +// CHECK-NEXT: FEAT_RDM Enable Armv8.1-A Rounding Double Multiply Add/Subtract instructions +// CHECK-NEXT: FEAT_RNG Enable Random Number generation instructions +// CHECK-NEXT: FEAT_SB Enable Armv8.5-A Speculation Barrier +// CHECK-NEXT: FEAT_SEL2 Enable Armv8.4-A Secure Exception Level 2 extension +// CHECK-NEXT: FEAT_SHA1, FEAT_SHA256 Enable SHA1 and SHA256 support +// CHECK-NEXT: FEAT_SHA3, FEAT_SHA512 Enable SHA512 and SHA3 support +// CHECK-NEXT: FEAT_SM4, FEAT_SM3 Enable SM3 and SM4 support +// CHECK-NEXT: FEAT_SPE Enable Statistical Profiling extension +// CHECK-NEXT: FEAT_SPECRES Enable Armv8.5-A execution and data prediction invalidation instructions +// CHECK-NEXT: FEAT_SSBS, FEAT_SSBS2 
Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions +// CHECK-NEXT: FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions +// CHECK-NEXT: FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions +// CHECK-NEXT: FEAT_SVE_SM4 Enable SM4 SVE2 instructions +// CHECK-NEXT: FEAT_TLBIOS, FEAT_TLBIRANGE Enable Armv8.4-A TLB Range and Maintenance instructions +// CHECK-NEXT: FEAT_TRBE Enable Trace Buffer Extension +// CHECK-NEXT: FEAT_TRF Enable Armv8.4-A Trace extension +// CHECK-NEXT: FEAT_UAO Enable Armv8.2-A UAO PState +// CHECK-NEXT: FEAT_VHE Enable Armv8.1-A Virtual Host extension \ No newline at end of file diff --git a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c index 642d2df211c21..9ef44b2bb403e 100644 --- a/clang/test/Misc/target-invalid-cpu-note/amdgcn.c +++ b/clang/test/Misc/target-invalid-cpu-note/amdgcn.c @@ -45,8 +45,6 @@ // CHECK-SAME: {{^}}, gfx909 // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx1010 diff --git a/clang/test/Misc/target-invalid-cpu-note/nvptx.c b/clang/test/Misc/target-invalid-cpu-note/nvptx.c index 3afcdf8c9fe5c..06ef72878340f 100644 --- a/clang/test/Misc/target-invalid-cpu-note/nvptx.c +++ b/clang/test/Misc/target-invalid-cpu-note/nvptx.c @@ -52,8 +52,6 @@ // CHECK-SAME: {{^}}, gfx90a // CHECK-SAME: {{^}}, gfx90c // CHECK-SAME: {{^}}, gfx9-4-generic -// CHECK-SAME: {{^}}, gfx940 -// CHECK-SAME: {{^}}, gfx941 // CHECK-SAME: {{^}}, gfx942 // CHECK-SAME: {{^}}, gfx950 // CHECK-SAME: {{^}}, gfx10-1-generic diff --git a/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp new file mode 100644 index 0000000000000..73dff88e506b4 --- /dev/null +++ b/clang/test/Modules/malformed-constraint-template-non-type-parm-decl.cpp @@ -0,0 +1,55 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: cd %t + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +// RUN: %clang_cc1 -std=c++20 mod.cppm -emit-reduced-module-interface -o mod.pcm -fallow-pcm-with-compiler-errors -verify +// RUN: %clang_cc1 -std=c++20 main.cpp -fmodule-file=mod=mod.pcm -verify -fallow-pcm-with-compiler-errors -fsyntax-only -ast-dump-all | FileCheck %s + +//--- mod.cppm +export module mod; + +template +concept ReferenceOf = Q; + +// expected-error@+2 {{unknown type name 'AngleIsInvalidNow'}} +// expected-error@+1 {{constexpr variable 'angle' must be initialized by a constant expression}} +constexpr struct angle {AngleIsInvalidNow e;} angle; + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {cos(v);} +auto cos(const Rep& q); + +// expected-error@+1 {{non-type template argument is not a constant expression}} +template auto R, typename Rep> requires requires(Rep v) {tan(v);} +auto tan(const Rep& q); + +//--- main.cpp +// expected-no-diagnostics +import mod; + +// 
CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid cos +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'cos' empty +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden cos 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' + +// CHECK: |-FunctionTemplateDecl {{.*}} col:6 imported in mod hidden invalid tan +// CHECK-NEXT: | |-NonTypeTemplateParmDecl {{.*}} col:34 imported in mod hidden referenced invalid 'ReferenceOf auto' depth 0 index 0 R +// CHECK-NEXT: | |-TemplateTypeParmDecl {{.*}} col:46 imported in mod hidden referenced typename depth 0 index 1 Rep +// CHECK-NEXT: | |-RequiresExpr {{.*}} 'bool' +// CHECK-NEXT: | | |-ParmVarDecl {{.*}} col:73 imported in mod hidden referenced v 'Rep' +// CHECK-NEXT: | | `-SimpleRequirement {{.*}} dependent +// CHECK-NEXT: | | `-CallExpr {{.*}} '' +// CHECK-NEXT: | | |-UnresolvedLookupExpr {{.*}} '' lvalue (ADL) = 'tan' empty +// CHECK-NEXT: | | `-DeclRefExpr {{.*}} 'Rep' lvalue ParmVar {{.*}} 'v' 'Rep' non_odr_use_unevaluated +// CHECK-NEXT: | `-FunctionDecl {{.*}} col:6 imported in mod hidden tan 'auto (const Rep &)' +// CHECK-NEXT: | `-ParmVarDecl {{.*}} col:21 imported in mod hidden q 'const Rep &' diff --git a/clang/test/SemaCXX/cxx2c-binding-pack.cpp b/clang/test/SemaCXX/cxx2c-binding-pack.cpp index 62e1da565f2b5..0f10dad3937ba 100644 --- a/clang/test/SemaCXX/cxx2c-binding-pack.cpp +++ b/clang/test/SemaCXX/cxx2c-binding-pack.cpp @@ -218,3 +218,18 @@ auto X = [] () { static_assert(sizeof...(pack3) == 5); }; } // namespace + +namespace GH125165 { + +template +auto f(auto t) { + const auto& [...pack] = t; + // expected-error@-1 {{cannot decompose non-class, non-array type 'char const'}} + (pack, ...); +}; + +void g() { + f('x'); // expected-note {{in instantiation}} +} + +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl similarity index 99% rename from clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl index 0fc2304d51ce0..db0387e9878f2 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx940-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx942-param.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -verify -S -o - %s typedef float v2f __attribute__((ext_vector_type(2))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl index d1c134c604dfc..b40b1c841b453 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx950.cl @@ -1,5 +1,5 @@ // REQUIRES: amdgpu-registered-target -// 
RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx940 -verify -S -o - %s +// RUN: %clang_cc1 -triple amdgcn-- -target-cpu gfx942 -verify -S -o - %s typedef float float4 __attribute__((ext_vector_type(4))); typedef float float16 __attribute__((ext_vector_type(16))); diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl similarity index 81% rename from clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl rename to clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl index 7cf80f7c92677..0b3f692f33998 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-gfx940-err.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-gfx942-err.cl @@ -1,4 +1,4 @@ -// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -verify=gfx940,expected -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx942 -S -verify=gfx942,expected -o - %s // RUN: %clang_cc1 -cl-std=CL2.0 -O0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify=gfx950,expected -o - %s // REQUIRES: amdgpu-registered-target @@ -8,12 +8,12 @@ void test_global_load_lds_unsupported_size(global u32* src, local u32 *dst, u32 __builtin_amdgcn_global_load_lds(src, dst, size, /*offset=*/0, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, offset, /*aux=*/0); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} __builtin_amdgcn_global_load_lds(src, dst, /*size=*/4, /*offset=*/0, aux); // expected-error{{argument to '__builtin_amdgcn_global_load_lds' must be a constant integer}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx940-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} - __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx940-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/5, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/0, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/3, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} + __builtin_amdgcn_global_load_lds(src, 
dst, /*size=*/12, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/16, /*offset=*/0, /*aux=*/0); // gfx942-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} + __builtin_amdgcn_global_load_lds(src, dst, /*size=*/-1, /*offset=*/0, /*aux=*/0); // expected-error{{invalid size value}} gfx942-note {{size must be 1, 2, or 4}} gfx950-note {{size must be 1, 2, 4, 12 or 16}} } __attribute__((target("gfx950-insts"))) diff --git a/flang-rt/lib/runtime/io-api-minimal.cpp b/flang-rt/lib/runtime/io-api-minimal.cpp index 8d8c9c6070b04..fdf7183ed5176 100644 --- a/flang-rt/lib/runtime/io-api-minimal.cpp +++ b/flang-rt/lib/runtime/io-api-minimal.cpp @@ -150,7 +150,8 @@ bool IODEF(OutputLogical)(Cookie cookie, bool truth) { // Provide own definition for `std::__libcpp_verbose_abort` to avoid dependency // on the version provided by libc++. -void std::__libcpp_verbose_abort(char const *format, ...) { +void std::__libcpp_verbose_abort(char const *format, ...) noexcept( + noexcept(std::__libcpp_verbose_abort(""))) { va_list list; va_start(list, format); std::vfprintf(stderr, format, list); diff --git a/flang/cmake/modules/AddFlangOffloadRuntime.cmake b/flang/cmake/modules/AddFlangOffloadRuntime.cmake index 8e4f47d18535d..eb0e964559ed5 100644 --- a/flang/cmake/modules/AddFlangOffloadRuntime.cmake +++ b/flang/cmake/modules/AddFlangOffloadRuntime.cmake @@ -98,10 +98,10 @@ macro(enable_omp_offload_compilation files) set(all_amdgpu_architectures "gfx700;gfx701;gfx801;gfx803;gfx900;gfx902;gfx906" - "gfx908;gfx90a;gfx90c;gfx940;gfx1010;gfx1030" + "gfx908;gfx90a;gfx90c;gfx942;gfx950;gfx1010;gfx1030" "gfx1031;gfx1032;gfx1033;gfx1034;gfx1035;gfx1036" "gfx1100;gfx1101;gfx1102;gfx1103;gfx1150;gfx1151" - "gfx1152;gfx1153" + "gfx1152;gfx1153;gfx1200;gfx1201" ) set(all_nvptx_architectures "sm_35;sm_37;sm_50;sm_52;sm_53;sm_60;sm_61;sm_62" diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index c6c2e29a420ea..5e76d4331f6de 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -45,6 +45,11 @@ A list of non-standard directives supported by Flang times if possible. When `n` is omitted, the compiler should attempt to fully unroll the loop. Some compilers accept an optional `=` before the `n` when `n` is present in the directive. Flang does not. +* `!dir$ unroll_and_jam [N]` controls how many times a loop should be unrolled and + jammed. It must be placed immediately before a loop that follows. `N` is an optional + integer specifying the unrolling factor. When `N` is `0` or `1`, the loop + should not be unrolled at all. If `N` is omitted, the optimizer will + select the number of times to unroll the loop. # Directive Details diff --git a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h index 8d17e4e476d10..c71988d081dd0 100644 --- a/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h +++ b/flang/include/flang/Optimizer/Analysis/AliasAnalysis.h @@ -211,14 +211,14 @@ struct AliasAnalysis { fir::AliasAnalysis::Source getSource(mlir::Value, bool getLastInstantiationPoint = false); + /// Return true, if `ty` is a reference type to a boxed + /// POINTER object or a raw fir::PointerType. + static bool isPointerReference(mlir::Type ty); + private: /// Return true, if `ty` is a reference type to an object of derived type /// that contains a component with POINTER attribute.
static bool isRecordWithPointerComponent(mlir::Type ty); - - /// Return true, if `ty` is a reference type to a boxed - /// POINTER object or a raw fir::PointerType. - static bool isPointerReference(mlir::Type ty); }; inline bool operator==(const AliasAnalysis::Source::SourceOrigin &lhs, diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 21ee1d0517840..75c11301285b3 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -209,6 +209,7 @@ class ParseTreeDumper { NODE(CompilerDirective, Unrecognized) NODE(CompilerDirective, VectorAlways) NODE(CompilerDirective, Unroll) + NODE(CompilerDirective, UnrollAndJam) NODE(parser, ComplexLiteralConstant) NODE(parser, ComplexPart) NODE(parser, ComponentArraySpec) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index c3a02fca5ade8..c2fa9a2228180 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -3349,6 +3349,8 @@ struct StmtFunctionStmt { // !DIR$ IGNORE_TKR [ [(tkrdmac...)] name ]... // !DIR$ LOOP COUNT (n1[, n2]...) // !DIR$ name[=value] [, name[=value]]... = can be : +// !DIR$ UNROLL [N] +// !DIR$ UNROLL_AND_JAM [N] // !DIR$ struct CompilerDirective { UNION_CLASS_BOILERPLATE(CompilerDirective); @@ -3371,10 +3373,13 @@ struct CompilerDirective { struct Unroll { WRAPPER_CLASS_BOILERPLATE(Unroll, std::optional); }; + struct UnrollAndJam { + WRAPPER_CLASS_BOILERPLATE(UnrollAndJam, std::optional); + }; EMPTY_CLASS(Unrecognized); CharBlock source; std::variant, LoopCount, std::list, - VectorAlways, std::list, Unroll, Unrecognized> + VectorAlways, std::list, Unroll, UnrollAndJam, Unrecognized> u; }; @@ -4553,8 +4558,8 @@ WRAPPER_CLASS(OmpReductionInitializerClause, Expr); struct OpenMPDeclareReductionConstruct { TUPLE_CLASS_BOILERPLATE(OpenMPDeclareReductionConstruct); CharBlock source; - std::tuple, - OmpReductionCombiner, std::optional> + std::tuple, + std::optional> t; }; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 7c217ce2f404c..1b24ed12e04f1 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2205,11 +2205,39 @@ class FirConverter : public Fortran::lower::AbstractConverter { /*full=*/fullUnrollAttr, {}, {}, {}); } + // Enabling unroll and jamming directive without a value. + // For directives with a value, if the value is greater than 1, + // force unrolling with the given factor. Otherwise, disable unrolling and + // jamming. + mlir::LLVM::LoopUnrollAndJamAttr + genLoopUnrollAndJamAttr(std::optional count) { + mlir::BoolAttr falseAttr = + mlir::BoolAttr::get(builder->getContext(), false); + mlir::BoolAttr trueAttr = mlir::BoolAttr::get(builder->getContext(), true); + mlir::IntegerAttr countAttr; + bool shouldUnroll = true; + if (count.has_value()) { + auto unrollingFactor = count.value(); + if (unrollingFactor == 0 || unrollingFactor == 1) { + shouldUnroll = false; + } else { + countAttr = + builder->getIntegerAttr(builder->getI64Type(), unrollingFactor); + } + } + + mlir::BoolAttr disableAttr = shouldUnroll ? 
falseAttr : trueAttr; + return mlir::LLVM::LoopUnrollAndJamAttr::get( + builder->getContext(), /*disable=*/disableAttr, /*count*/ countAttr, {}, + {}, {}, {}, {}); + } + void addLoopAnnotationAttr( IncrementLoopInfo &info, llvm::SmallVectorImpl &dirs) { mlir::LLVM::LoopVectorizeAttr va; mlir::LLVM::LoopUnrollAttr ua; + mlir::LLVM::LoopUnrollAndJamAttr uja; bool has_attrs = false; for (const auto *dir : dirs) { Fortran::common::visit( @@ -2226,12 +2254,16 @@ class FirConverter : public Fortran::lower::AbstractConverter { ua = genLoopUnrollAttr(u.v); has_attrs = true; }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &u) { + uja = genLoopUnrollAndJamAttr(u.v); + has_attrs = true; + }, [&](const auto &) {}}, dir->u); } mlir::LLVM::LoopAnnotationAttr la = mlir::LLVM::LoopAnnotationAttr::get( - builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, {}, {}, - {}, {}, {}, {}, {}, {}, {}, {}, {}); + builder->getContext(), {}, /*vectorize=*/va, {}, /*unroll*/ ua, + /*unroll_and_jam*/ uja, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}); if (has_attrs) info.doLoop.setLoopAnnotationAttr(la); } @@ -2887,6 +2919,9 @@ class FirConverter : public Fortran::lower::AbstractConverter { [&](const Fortran::parser::CompilerDirective::Unroll &) { attachDirectiveToLoop(dir, &eval); }, + [&](const Fortran::parser::CompilerDirective::UnrollAndJam &) { + attachDirectiveToLoop(dir, &eval); + }, [&](const auto &) {}}, dir.u); } diff --git a/flang/lib/Lower/Support/Utils.cpp b/flang/lib/Lower/Support/Utils.cpp index 5a9a839330364..ed2700c42fc55 100644 --- a/flang/lib/Lower/Support/Utils.cpp +++ b/flang/lib/Lower/Support/Utils.cpp @@ -478,9 +478,47 @@ class IsEqualEvaluateExpr { return isEqual(x.proc(), y.proc()) && isEqual(x.arguments(), y.arguments()); } template + static bool isEqual(const Fortran::evaluate::ImpliedDo &x, + const Fortran::evaluate::ImpliedDo &y) { + return isEqual(x.values(), y.values()) && isEqual(x.lower(), y.lower()) && + isEqual(x.upper(), y.upper()) && isEqual(x.stride(), y.stride()); + } + template + static bool isEqual(const Fortran::evaluate::ArrayConstructorValues &x, + const Fortran::evaluate::ArrayConstructorValues &y) { + using Expr = Fortran::evaluate::Expr; + using ImpliedDo = Fortran::evaluate::ImpliedDo; + for (const auto &[xValue, yValue] : llvm::zip(x, y)) { + bool checkElement = Fortran::common::visit( + common::visitors{ + [&](const Expr &v, const Expr &w) { return isEqual(v, w); }, + [&](const ImpliedDo &v, const ImpliedDo &w) { + return isEqual(v, w); + }, + [&](const Expr &, const ImpliedDo &) { return false; }, + [&](const ImpliedDo &, const Expr &) { return false; }, + }, + xValue.u, yValue.u); + if (!checkElement) { + return false; + } + } + return true; + } + static bool isEqual(const Fortran::evaluate::SubscriptInteger &x, + const Fortran::evaluate::SubscriptInteger &y) { + return x == y; + } + template static bool isEqual(const Fortran::evaluate::ArrayConstructor &x, const Fortran::evaluate::ArrayConstructor &y) { - llvm::report_fatal_error("not implemented"); + bool checkCharacterType = true; + if constexpr (A::category == Fortran::common::TypeCategory::Character) { + checkCharacterType = isEqual(*x.LEN(), *y.LEN()); + } + using Base = Fortran::evaluate::ArrayConstructorValues; + return isEqual((Base)x, (Base)y) && + (x.GetType() == y.GetType() && checkCharacterType); } static bool isEqual(const Fortran::evaluate::ImpliedDoIndex &x, const Fortran::evaluate::ImpliedDoIndex &y) { diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp 
b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp index 873758487ddd0..70fa18ad65b9b 100644 --- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp +++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp @@ -54,10 +54,11 @@ static bool hasGlobalOpTargetAttr(mlir::Value v, fir::AddrOfOp op) { static mlir::Value getOriginalDef(mlir::Value v, fir::AliasAnalysis::Source::Attributes &attributes, - bool &isCapturedInInternalProcedure) { + bool &isCapturedInInternalProcedure, bool &approximateSource) { mlir::Operation *defOp; bool breakFromLoop = false; while (!breakFromLoop && (defOp = v.getDefiningOp())) { + mlir::Type ty = defOp->getResultTypes()[0]; llvm::TypeSwitch(defOp) .Case([&](fir::ConvertOp op) { v = op.getValue(); }) .Case([&](auto op) { @@ -67,6 +68,18 @@ getOriginalDef(mlir::Value v, isCapturedInInternalProcedure |= varIf.isCapturedInInternalProcedure(); }) + .Case([&](auto op) { + if (fir::AliasAnalysis::isPointerReference(ty)) + attributes.set(fir::AliasAnalysis::Attribute::Pointer); + v = op->getOperand(0); + approximateSource = true; + }) + .Case([&](hlfir::DesignateOp op) { + auto varIf = llvm::cast(defOp); + attributes |= getAttrsFromVariable(varIf); + v = op.getMemref(); + approximateSource = true; + }) .Default([&](auto op) { breakFromLoop = true; }); } return v; @@ -609,7 +622,8 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v, attributes.set(Attribute::Pointer); auto def = getOriginalDef(op.getMemref(), attributes, - isCapturedInInternalProcedure); + isCapturedInInternalProcedure, + approximateSource); if (auto addrOfOp = def.template getDefiningOp()) { global = addrOfOp.getSymbol(); diff --git a/flang/lib/Parser/Fortran-parsers.cpp b/flang/lib/Parser/Fortran-parsers.cpp index b5bcb53a12761..cfe9ecb29b0b7 100644 --- a/flang/lib/Parser/Fortran-parsers.cpp +++ b/flang/lib/Parser/Fortran-parsers.cpp @@ -1308,11 +1308,14 @@ constexpr auto vectorAlways{ "VECTOR ALWAYS" >> construct()}; constexpr auto unroll{ "UNROLL" >> construct(maybe(digitString64))}; +constexpr auto unrollAndJam{"UNROLL_AND_JAM" >> + construct(maybe(digitString64))}; TYPE_PARSER(beginDirective >> "DIR$ "_tok >> sourced((construct(ignore_tkr) || construct(loopCount) || construct(assumeAligned) || construct(vectorAlways) || + construct(unrollAndJam) || construct(unroll) || construct( many(construct( diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 2b6c77c08cc58..b39b8737b70c0 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -170,8 +170,8 @@ TYPE_PARSER(sourced( // TYPE_PARSER(construct(nonemptyList(Parser{}))) TYPE_PARSER( // - construct(Parser{}) || - construct(Parser{})) + construct(Parser{}) || + construct(Parser{})) TYPE_PARSER(construct( // Parser{}, @@ -1148,9 +1148,7 @@ TYPE_PARSER(construct( // 2.16 Declare Reduction Construct TYPE_PARSER(sourced(construct( verbatim("DECLARE REDUCTION"_tok), - "(" >> Parser{} / ":", - nonemptyList(Parser{}) / ":", - Parser{} / ")", + "(" >> indirect(Parser{}) / ")", maybe(Parser{})))) // declare-target with list diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index cd91fbe4ea5eb..6260a01897527 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -1851,6 +1851,10 @@ class UnparseVisitor { Word("!DIR$ UNROLL"); Walk(" ", unroll.v); }, + [&](const CompilerDirective::UnrollAndJam &unrollAndJam) { + Word("!DIR$ UNROLL_AND_JAM"); + Walk(" ", unrollAndJam.v); + }, [&](const CompilerDirective::Unrecognized &) { Word("!DIR$ "); 
Word(x.source.ToString()); @@ -2690,11 +2694,10 @@ class UnparseVisitor { BeginOpenMP(); Word("!$OMP DECLARE REDUCTION "); Put("("); - Walk(std::get(x.t)), Put(" : "); - Walk(std::get>(x.t), ","), Put(" : "); - Walk(std::get(x.t)); + Walk(std::get>(x.t)); Put(")"); Walk(std::get>(x.t)); + Put("\n"); EndOpenMP(); } diff --git a/flang/lib/Semantics/canonicalize-directives.cpp b/flang/lib/Semantics/canonicalize-directives.cpp index b27a27618808b..1a0a0d145b3e2 100644 --- a/flang/lib/Semantics/canonicalize-directives.cpp +++ b/flang/lib/Semantics/canonicalize-directives.cpp @@ -56,7 +56,8 @@ bool CanonicalizeDirectives( static bool IsExecutionDirective(const parser::CompilerDirective &dir) { return std::holds_alternative( dir.u) || - std::holds_alternative(dir.u); + std::holds_alternative(dir.u) || + std::holds_alternative(dir.u); } void CanonicalizationOfDirectives::Post(parser::SpecificationPart &spec) { @@ -115,6 +116,9 @@ void CanonicalizationOfDirectives::Post(parser::Block &block) { [&](parser::CompilerDirective::Unroll &) { CheckLoopDirective(*dir, block, it); }, + [&](parser::CompilerDirective::UnrollAndJam &) { + CheckLoopDirective(*dir, block, it); + }, [&](auto &) {}}, dir->u); } diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 1d6fe6c8d4249..49e507feab580 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -3178,6 +3178,10 @@ bool OmpStructureChecker::CheckReductionOperator( const SourceName &realName{name->symbol->GetUltimate().name()}; valid = llvm::is_contained({"max", "min", "iand", "ior", "ieor"}, realName); + if (!valid) { + auto *misc{name->symbol->detailsIf()}; + valid = misc && misc->kind() == MiscDetails::Kind::ConstructName; + } } if (!valid) { context_.Say(source, diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 91a1b3061e1f9..38888a4dc1461 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -446,6 +446,9 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OpenMPDeclareMapperConstruct &); void Post(const parser::OpenMPDeclareMapperConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPDeclareReductionConstruct &); + void Post(const parser::OpenMPDeclareReductionConstruct &) { PopContext(); } + bool Pre(const parser::OpenMPThreadprivate &); void Post(const parser::OpenMPThreadprivate &) { PopContext(); } @@ -1976,6 +1979,12 @@ bool OmpAttributeVisitor::Pre(const parser::OpenMPDeclareMapperConstruct &x) { return true; } +bool OmpAttributeVisitor::Pre( + const parser::OpenMPDeclareReductionConstruct &x) { + PushContext(x.source, llvm::omp::Directive::OMPD_declare_reduction); + return true; +} + bool OmpAttributeVisitor::Pre(const parser::OpenMPThreadprivate &x) { PushContext(x.source, llvm::omp::Directive::OMPD_threadprivate); const auto &list{std::get(x.t)}; @@ -2309,7 +2318,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { } if (Symbol * found{currScope().FindSymbol(name.source)}) { - if (found->test(semantics::Symbol::Flag::OmpThreadprivate)) + if (found->GetUltimate().test(semantics::Symbol::Flag::OmpThreadprivate)) return; } diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index e64abe6b50e78..17a6665dfb6a5 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -1482,6 +1482,15 @@ class OmpVisitor : public 
virtual DeclarationVisitor { return false; } + bool Pre(const parser::OpenMPDeclareReductionConstruct &x) { + AddOmpSourceRange(x.source); + parser::OmpClauseList emptyList{std::list{}}; + ProcessReductionSpecifier( + std::get>(x.t).value(), + emptyList); + Walk(std::get>(x.t)); + return false; + } bool Pre(const parser::OmpMapClause &); void Post(const parser::OmpBeginLoopDirective &) { @@ -1732,11 +1741,19 @@ void OmpVisitor::ProcessMapperSpecifier(const parser::OmpMapperSpecifier &spec, void OmpVisitor::ProcessReductionSpecifier( const parser::OmpReductionSpecifier &spec, const parser::OmpClauseList &clauses) { + BeginDeclTypeSpec(); + const auto &id{std::get(spec.t)}; + if (auto procDes{std::get_if(&id.u)}) { + if (auto *name{std::get_if(&procDes->u)}) { + name->symbol = + &MakeSymbol(*name, MiscDetails{MiscDetails::Kind::ConstructName}); + } + } + EndDeclTypeSpec(); // Creating a new scope in case the combiner expression (or clauses) use // reserved identifiers, like "omp_in". This is a temporary solution until // we deal with these in a more thorough way. PushScope(Scope::Kind::OtherConstruct, nullptr); - Walk(std::get(spec.t)); Walk(std::get(spec.t)); Walk(std::get>(spec.t)); Walk(clauses); @@ -9535,7 +9552,8 @@ void ResolveNamesVisitor::Post(const parser::AssignedGotoStmt &x) { void ResolveNamesVisitor::Post(const parser::CompilerDirective &x) { if (std::holds_alternative(x.u) || - std::holds_alternative(x.u)) { + std::holds_alternative(x.u) || + std::holds_alternative(x.u)) { return; } if (const auto *tkr{ diff --git a/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir new file mode 100644 index 0000000000000..de81841d9249d --- /dev/null +++ b/flang/test/Analysis/AliasAnalysis/load-ptr-designate.fir @@ -0,0 +1,511 @@ +// Check aliasing with the address *in* (not *of*) a pointer component +// (hlfir.designate). +// +// Throughout this test, the ".fir" suffix on symbols indicates a version of the +// MLIR after convert-hlfir-to-fir. A key difference is that component access +// is via fir.coordinate_of instead of hlfir.designate. We would like alias +// analysis results to be the same in both versions. + +// RUN: fir-opt %s -split-input-file -o /dev/null --mlir-disable-threading \ +// RUN: -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' \ +// RUN: 2>&1 | FileCheck -match-full-lines %s + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// type(ty) :: obj +// type(ty), target :: t_obj +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer.
+// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. 
+// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %11 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %12 = fir.load %11 : !fir.ref>> + %13 = fir.box_addr %12 {test.ptr="obj%p1.tgt"}: (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %14 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %15 = hlfir.designate %1#0{"arr"} <%14> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %16 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %17 = fir.load %16 : !fir.ref>> + %repeat17 = fir.load %16 : !fir.ref>> + %18 = fir.box_addr %17 {test.ptr="obj%alloc.tgt"}: (!fir.box>) -> !fir.heap + %repeat18 = fir.box_addr %repeat17 {test.ptr="obj%alloc.tgt2"}: (!fir.box>) -> !fir.heap + %c2_1 = arith.constant 2 : index + %19 = fir.shape %c2_1 : (index) -> !fir.shape<1> + %c1_2 = arith.constant 1 : index + %20 = hlfir.designate %5#0{"arr"} <%19> (%c1_2) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %21 = hlfir.designate %5#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "obj", uniq_name = "_QFtestEobj"} + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : 
(!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.alloca !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> {bindc_name = "t_obj", fir.target, uniq_name = "_QFtestEt_obj"} + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// 
Repeat above test except composites are dummy args instead of locals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// end module m +// subroutine test(obj, t_obj) +// use m +// type(ty) :: obj +// type(ty), target :: t_obj +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// TODO: Thus, we expect all cases below to be NoAlias. However, target dummy +// args are currently indiscriminately analyzed as MayAlias.
+// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : !fir.dscope + %1:2 = hlfir.declare %arg0 dummy_scope %0 {test.ptr="obj", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4:2 = hlfir.declare %arg1 dummy_scope %0 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6:2 = hlfir.declare %5 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %7 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %8 = fir.load %7 : !fir.ref>> + %9 = fir.box_addr %8 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %10 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %11 = fir.load %10 : !fir.ref>> + %12 = fir.box_addr %11 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %13 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %14 = hlfir.designate %1#0{"arr"} <%13> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %15 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %16 = fir.load %15 : 
!fir.ref>> + %repeat16 = fir.load %15 : !fir.ref>> + %17 = fir.box_addr %16 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat17 = fir.box_addr %repeat16 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %18 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %19 = hlfir.designate %4#0{"arr"} <%18> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %20 = hlfir.designate %4#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr %21 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir(%arg0: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "obj"}, %arg1: !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> {fir.bindc_name = "t_obj", fir.target}) { + %0 = fir.dummy_scope : !fir.dscope + %1 = fir.declare %arg0 dummy_scope %0 {test.ptr="obj.fir", uniq_name = "_QFtestEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.declare %arg1 dummy_scope %0 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.dscope) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %6 = fir.declare %5 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %7 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %8 = fir.coordinate_of %1, %7 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %11 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %12 = fir.coordinate_of %1, %11 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %15 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %16 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %17 = fir.coordinate_of %1, %16 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %18 = fir.array_coor %17(%15) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %19 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %20 = fir.coordinate_of %1, %19 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %21 = fir.load %20 : !fir.ref>> + %repeat21 = fir.load %20 : !fir.ref>> + %22 = fir.box_addr 
%21 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat22 = fir.box_addr %repeat21 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %25 = fir.coordinate_of %4, %24 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %26 = fir.array_coor %25(%23) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %27 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %28 = fir.coordinate_of %4, %27 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %29 = fir.load %28 : !fir.ref>> + %30 = fir.box_addr %29 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} + +// ----- + +// Repeat above test except composites are globals. + +// module m +// type :: ty +// real, pointer :: p0, p1 +// real :: arr(2) +// real, allocatable :: alloc +// ! target attribute on components is not supported +// end type ty +// type(ty) :: obj +// type(ty), target :: t_obj +// end module m +// subroutine test() +// use m +// real, target :: t +// real :: v +// end subroutine test + +// CHECK-LABEL: Testing : "_QPtest" + +// The address in a pointer can alias the address in another pointer or the +// address of a target but not the address of other variables. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: t#0 <-> obj%p1.tgt#0: MayAlias +// CHECK-DAG: v#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: v#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: t.fir#0 <-> obj%p1.tgt.fir#0: MayAlias +// CHECK-DAG: v.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: v.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// The address in a pointer cannot alias the address of a pointer. +// CHECK-DAG: obj%p0#0 <-> obj%p0.tgt#0: NoAlias +// CHECK-DAG: obj%p0#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.tgt#0 <-> obj%p1#0: NoAlias +// CHECK-DAG: obj%p1#0 <-> obj%p1.tgt#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p0.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.fir#0 <-> obj%p1.tgt.fir#0: NoAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%p1.fir#0: NoAlias +// CHECK-DAG: obj%p1.fir#0 <-> obj%p1.tgt.fir#0: NoAlias + +// For some cases, AliasAnalysis analyzes hlfir.designate like fir.box_addr, so +// make sure it doesn't mistakenly see the address of obj%arr(1) as an address +// that was loaded from a pointer and that could alias something. However, +// t_obj%arr is a target. +// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%arr(1) are analyzed as +// MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%arr(1)#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%arr(1).fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%arr(1).fir#0: MayAlias + +// Like a pointer, an allocatable contains an address, but an allocatable is not +// a pointer and so cannot alias pointers. However, t_obj%alloc is a target. 
+// TODO: Thus, we expect the first case (and corresponding .fir case) below to +// be NoAlias. However, the addresses obj%p0.tgt and obj%alloc.tgt are analyzed +// as MayAlias because they have the same source and both are data. +// CHECK-DAG: obj%p0.tgt#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt#0 <-> t_obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias +// CHECK-DAG: obj%p0.tgt.fir#0 <-> t_obj%alloc.tgt.fir#0: MayAlias + +// The address in an allocatable cannot alias the address of that allocatable. +// CHECK-DAG: obj%alloc#0 <-> obj%alloc.tgt#0: NoAlias +// CHECK-DAG: t_obj%alloc#0 <-> t_obj%alloc.tgt#0: NoAlias +// CHECK-DAG: obj%alloc.fir#0 <-> obj%alloc.tgt.fir#0: NoAlias +// CHECK-DAG: t_obj%alloc.fir#0 <-> t_obj%alloc.tgt.fir#0: NoAlias + +// The address of a composite aliases the address of any component but not the +// address in a pointer or allocatable component. +// TODO: Thus, we expect the obj%*.tgt cases below to be NoAlias. However, the +// addresses obj and obj%*.tgt are analyzed as MayAlias because they have the +// same source and both are data. +// CHECK-DAG: obj#0 <-> obj%p0#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%p0.tgt#0: MayAlias +// CHECK-DAG: obj#0 <-> obj%alloc.tgt#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%p0.tgt.fir#0: MayAlias +// CHECK-DAG: obj.fir#0 <-> obj%alloc.tgt.fir#0: MayAlias + +// The addresses obtained via multiple load instructions from the same +// allocatable can alias. +// CHECK-DAG: obj%alloc.tgt#0 <-> obj%alloc.tgt2#0: MayAlias +// CHECK-DAG: obj%alloc.tgt.fir#0 <-> obj%alloc.tgt2.fir#0: MayAlias + +func.func @_QPtest() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1:2 = hlfir.declare %0 {test.ptr="obj", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3:2 = hlfir.declare %2 {test.ptr="t", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5:2 = hlfir.declare %4 {test.ptr="t_obj", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7:2 = hlfir.declare %6 {test.ptr="v", uniq_name = "_QFtestEv"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %8 = hlfir.designate %1#0{"p0"} {test.ptr="obj%p0", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %9 = fir.load %8 : !fir.ref>> + %10 = fir.box_addr %9 {test.ptr="obj%p0.tgt"} : (!fir.box>) -> !fir.ptr + %12 = hlfir.designate %1#0{"p1"} {test.ptr="obj%p1", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %13 = fir.load %12 : !fir.ref>> + %14 = fir.box_addr %13 {test.ptr="obj%p1.tgt"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + 
%16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = hlfir.designate %1#0{"arr"} <%16> (%c1) {test.ptr="obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %19 = hlfir.designate %1#0{"alloc"} {test.ptr="obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %20 = fir.load %19 : !fir.ref>> + %repeat20 = fir.load %19 : !fir.ref>> + %21 = fir.box_addr %20 {test.ptr="obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + %repeat21 = fir.box_addr %repeat20 {test.ptr="obj%alloc.tgt2"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %23 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %24 = hlfir.designate %5#0{"arr"} <%23> (%c1_1) {test.ptr="t_obj%arr(1)"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.shape<1>, index) -> !fir.ref + %26 = hlfir.designate %5#0{"alloc"} {test.ptr="t_obj%alloc", fortran_attrs = #fir.var_attrs} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>> + %27 = fir.load %26 : !fir.ref>> + %28 = fir.box_addr %27 {test.ptr="t_obj%alloc.tgt"} : (!fir.box>) -> !fir.heap + return +} + +func.func @_QPtest.fir() { + %0 = fir.address_of(@_QMmEobj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %1 = fir.declare %0 {test.ptr="obj.fir", uniq_name = "_QMmEobj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %2 = fir.alloca f32 {bindc_name = "t", fir.target, uniq_name = "_QFtestEt"} + %3 = fir.declare %2 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QFtestEt"} : (!fir.ref) -> !fir.ref + %4 = fir.address_of(@_QMmEt_obj) : !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %5 = fir.declare %4 {test.ptr="t_obj.fir", fortran_attrs = #fir.var_attrs, uniq_name = "_QMmEt_obj"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>) -> !fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>> + %6 = fir.alloca f32 {bindc_name = "v", uniq_name = "_QFtestEv"} + %7 = fir.declare %6 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref) -> !fir.ref + %8 = fir.field_index p0, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %9 = fir.coordinate_of %1, %8 {test.ptr="obj%p0.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %10 = fir.load %9 : !fir.ref>> + %11 = fir.box_addr %10 {test.ptr = "obj%p0.tgt.fir"} : (!fir.box>) -> !fir.ptr + %12 = fir.field_index p1, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %13 = fir.coordinate_of %1, %12 {test.ptr="obj%p1.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %14 = fir.load %13 : !fir.ref>> + %15 = fir.box_addr %14 {test.ptr = "obj%p1.tgt.fir"} : (!fir.box>) -> !fir.ptr + %c2 = arith.constant 2 : index + %16 = fir.shape %c2 : (index) -> !fir.shape<1> + %c1 = arith.constant 1 : index + %17 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %18 = fir.coordinate_of %1, %17 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %19 = fir.array_coor %18(%16) %c1 {test.ptr="obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %20 = fir.field_index 
alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %21 = fir.coordinate_of %1, %20 {test.ptr="obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %22 = fir.load %21 : !fir.ref>> + %repeat22 = fir.load %21 : !fir.ref>> + %23 = fir.box_addr %22 {test.ptr = "obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + %repeat23 = fir.box_addr %repeat22 {test.ptr = "obj%alloc.tgt2.fir"} : (!fir.box>) -> !fir.heap + %c2_0 = arith.constant 2 : index + %24 = fir.shape %c2_0 : (index) -> !fir.shape<1> + %c1_1 = arith.constant 1 : index + %25 = fir.field_index arr, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %26 = fir.coordinate_of %5, %25 : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref> + %27 = fir.array_coor %26(%24) %c1_1 {test.ptr="t_obj%arr(1).fir"} : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %28 = fir.field_index alloc, !fir.type<_QMmTty{p0:!fir.box>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}> + %29 = fir.coordinate_of %5, %28 {test.ptr="t_obj%alloc.fir"} : (!fir.ref>,p1:!fir.box>,arr:!fir.array<2xf32>,alloc:!fir.box>}>>, !fir.field) -> !fir.ref>> + %30 = fir.load %29 : !fir.ref>> + %31 = fir.box_addr %30 {test.ptr = "t_obj%alloc.tgt.fir"} : (!fir.box>) -> !fir.heap + return +} diff --git a/flang/test/Integration/unroll_and_jam.f90 b/flang/test/Integration/unroll_and_jam.f90 new file mode 100644 index 0000000000000..771b7fb411855 --- /dev/null +++ b/flang/test/Integration/unroll_and_jam.f90 @@ -0,0 +1,48 @@ +! RUN: %flang_fc1 -emit-llvm -o - %s | FileCheck %s + +! CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam 4 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + +! CHECK-LABEL: unroll_and_jam_dir_0 +subroutine unroll_and_jam_dir_0 + integer :: a(10) + !dir$ unroll_and_jam 0 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_0 + +! CHECK-LABEL: unroll_and_jam_dir_1 +subroutine unroll_and_jam_dir_1 + integer :: a(10) + !dir$ unroll_and_jam 1 + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_DISABLE]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_1 + +! CHECK-LABEL: unroll_and_jam_dir_no_factor +subroutine unroll_and_jam_dir_no_factor + integer :: a(10) + !dir$ unroll_and_jam + ! CHECK: br i1 {{.*}}, label {{.*}}, label {{.*}}, !llvm.loop ![[ANNOTATION_NO_FACTOR:.*]] + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir_no_factor + +! CHECK: ![[ANNOTATION]] = distinct !{![[ANNOTATION]], ![[UNROLL_AND_JAM:.*]], ![[UNROLL_AND_JAM_COUNT:.*]]} +! CHECK: ![[UNROLL_AND_JAM]] = !{!"llvm.loop.unroll_and_jam.enable"} +! CHECK: ![[UNROLL_AND_JAM_COUNT]] = !{!"llvm.loop.unroll_and_jam.count", i32 4} +! CHECK: ![[ANNOTATION_DISABLE]] = distinct !{![[ANNOTATION_DISABLE]], ![[UNROLL_AND_JAM2:.*]]} +! CHECK: ![[UNROLL_AND_JAM2]] = !{!"llvm.loop.unroll_and_jam.disable"} +! 
CHECK: ![[ANNOTATION_NO_FACTOR]] = distinct !{![[ANNOTATION_NO_FACTOR]], ![[UNROLL_AND_JAM]]} diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 index 7a7d28db8d6f5..db50c9ac8ee9d 100644 --- a/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 +++ b/flang/test/Lower/OpenMP/Todo/omp-declare-reduction.f90 @@ -1,10 +1,10 @@ ! This test checks lowering of OpenMP declare reduction Directive. -// RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s +! RUN: not flang -fc1 -emit-fir -fopenmp %s 2>&1 | FileCheck %s subroutine declare_red() integer :: my_var - // CHECK: not yet implemented: OpenMPDeclareReductionConstruct + !CHECK: not yet implemented: OpenMPDeclareReductionConstruct !$omp declare reduction (my_red : integer : omp_out = omp_in) initializer (omp_priv = 0) my_var = 0 end subroutine declare_red diff --git a/flang/test/Lower/OpenMP/atomic-update.f90 b/flang/test/Lower/OpenMP/atomic-update.f90 index 16dae9d5f301c..7d04745015faa 100644 --- a/flang/test/Lower/OpenMP/atomic-update.f90 +++ b/flang/test/Lower/OpenMP/atomic-update.f90 @@ -185,4 +185,19 @@ program OmpAtomicUpdate !$omp atomic update w = max(w,x,y,z) +!CHECK: %[[IMP_DO:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<1>) -> !hlfir.expr { +!CHECK: ^bb0(%{{.*}}: index): +! [...] +!CHECK: %[[ADD_I1:.*]] = arith.addi {{.*}} : i32 +!CHECK: hlfir.yield_element %[[ADD_I1]] : i32 +!CHECK: } +! [...] +!CHECK: %[[SUM:.*]] = hlfir.sum %[[IMP_DO]] +!CHECK: omp.atomic.update %[[VAL_X_DECLARE]]#1 : !fir.ref { +!CHECK: ^bb0(%[[ARG0:.*]]: i32): +!CHECK: %[[ADD_I2:.*]] = arith.addi %[[ARG0]], %[[SUM]] : i32 +!CHECK: omp.yield(%[[ADD_I2]] : i32) +!CHECK: } + !$omp atomic update + x = x + sum([ (y+2, y=1, z) ]) end program OmpAtomicUpdate diff --git a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 index 7d02987c5eade..6201459bc42ca 100644 --- a/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 +++ b/flang/test/Lower/OpenMP/threadprivate-hlfir.f90 @@ -15,8 +15,6 @@ !CHECK: %{{.*}} = fir.call @_FortranAioOutputInteger32(%{{.*}}, %[[TP_VAL]]) fastmath : (!fir.ref, i32) -> i1 !CHECK: omp.terminator -!CHECK: fir.global internal @_QFsubEa : i32 - subroutine sub() integer, save:: a !$omp threadprivate(a) @@ -25,3 +23,55 @@ subroutine sub() !$omp end parallel end subroutine +!CHECK-LABEL: func.func @_QPsub_02() +subroutine sub_02() + integer, save :: a + !$omp threadprivate(a) + !CHECK: %[[ADDR_02:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_02:.*]]:2 = hlfir.declare %[[ADDR_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_02:.*]] = omp.threadprivate %[[DECL_02]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_02:.*]]:2 = hlfir.declare %[[TP_02]] {{{.*}} uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + call sub_03 + !CHECK: fir.call @_QFsub_02Psub_03() fastmath : () -> () + !CHECK: return + +contains + + !CHECK-LABEL: func.func private @_QFsub_02Psub_03() + subroutine sub_03() + !CHECK: %[[ADDR_03:.*]] = fir.address_of(@_QFsub_02Ea) : !fir.ref + !CHECK: %[[DECL_03:.*]]:2 = hlfir.declare %[[ADDR_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !CHECK: %[[TP_03:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_03:.*]]:2 = hlfir.declare %[[TP_03]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + !$omp parallel default(private) + !CHECK: omp.parallel + !CHECK: 
%[[TP_04:.*]] = omp.threadprivate %[[DECL_03]]#1 : !fir.ref -> !fir.ref + !CHECK: %[[TP_DECL_04:.*]]:2 = hlfir.declare %[[TP_04]] {uniq_name = "_QFsub_02Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel + end subroutine +end subroutine + +module mod_01 + integer, save :: a + !CHECK: fir.global @_QMmod_01Ea : i32 + !$omp threadprivate(a) +end module + +!CHECK-LABEL: func.func @_QPsub_05() +subroutine sub_05() + use mod_01, only: a + !$omp parallel default(private) + !CHECK: omp.parallel { + !CHECK: %[[TP_05:.*]] = omp.threadprivate %{{.*}} : !fir.ref -> !fir.ref + !CHECK: %{{.*}} = hlfir.declare %[[TP_05]] {uniq_name = "_QMmod_01Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) + print *, a + !CHECK: omp.terminator + !$omp end parallel +end subroutine + + +!CHECK: fir.global internal @_QFsubEa : i32 + +!CHECK: fir.global internal @_QFsub_02Ea : i32 diff --git a/flang/test/Lower/unroll_and_jam.f90 b/flang/test/Lower/unroll_and_jam.f90 new file mode 100644 index 0000000000000..afc5a7b6b271e --- /dev/null +++ b/flang/test/Lower/unroll_and_jam.f90 @@ -0,0 +1,34 @@ +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s + +! CHECK: #loop_unroll_and_jam = #llvm.loop_unroll_and_jam +! CHECK: #loop_unroll_and_jam1 = #llvm.loop_unroll_and_jam +! CHECK: #loop_annotation = #llvm.loop_annotation +! CHECK: #loop_annotation1 = #llvm.loop_annotation + +! CHECK-LABEL: unroll_and_jam_dir +subroutine unroll_and_jam_dir + integer :: a(10) + !dir$ unroll_and_jam + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do + + !dir$ unroll_and_jam 2 + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation1} + do i=1,10 + a(i)=i + end do +end subroutine unroll_and_jam_dir + + +! CHECK-LABEL: intermediate_directive +subroutine intermediate_directive + integer :: a(10) + !dir$ unroll_and_jam + !dir$ unknown + !CHECK: fir.do_loop {{.*}} attributes {loopAnnotation = #loop_annotation} + do i=1,10 + a(i)=i + end do +end subroutine intermediate_directive diff --git a/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 new file mode 100644 index 0000000000000..a2a3ef9f630ab --- /dev/null +++ b/flang/test/Parser/OpenMP/declare-reduction-unparse.f90 @@ -0,0 +1,21 @@ +! RUN: %flang_fc1 -fdebug-unparse -fopenmp %s | FileCheck --ignore-case %s +! 
RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp %s | FileCheck --check-prefix="PARSE-TREE" %s +!CHECK-LABEL: program main +program main + integer :: my_var + !CHECK: !$OMP DECLARE REDUCTION (my_add_red:INTEGER: omp_out=omp_out+omp_in + !CHECK-NEXT: ) INITIALIZER(OMP_PRIV = 0_4) + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = omp_get_thread_num() + 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end program main + +!PARSE-TREE: OpenMPDeclareReductionConstruct +!PARSE-TREE: OmpReductionIdentifier -> ProcedureDesignator -> Name = 'my_add_red' +!PARSE-TREE: DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec +!PARSE-TREE: OmpReductionCombiner -> AssignmentStmt = 'omp_out=omp_out+omp_in' +!PARSE-TREE: OmpReductionInitializerClause -> Expr = '0_4' diff --git a/flang/test/Parser/compiler-directives.f90 b/flang/test/Parser/compiler-directives.f90 index f372a9f533a35..d1e386a01dd4d 100644 --- a/flang/test/Parser/compiler-directives.f90 +++ b/flang/test/Parser/compiler-directives.f90 @@ -46,3 +46,14 @@ subroutine unroll do i=1,10 enddo end subroutine + +subroutine unroll_and_jam + !dir$ unroll_and_jam + ! CHECK: !DIR$ UNROLL_AND_JAM + do i=1,10 + enddo + !dir$ unroll_and_jam 2 + ! CHECK: !DIR$ UNROLL_AND_JAM 2 + do i=1,10 + enddo +end subroutine diff --git a/flang/test/Semantics/OpenMP/declarative-directive01.f90 b/flang/test/Semantics/OpenMP/declarative-directive01.f90 index 17dc50b70e542..e8bf605565fad 100644 --- a/flang/test/Semantics/OpenMP/declarative-directive01.f90 +++ b/flang/test/Semantics/OpenMP/declarative-directive01.f90 @@ -2,9 +2,6 @@ ! Check OpenMP declarative directives -!TODO: all internal errors -! enable declare-reduction example after name resolution - ! 2.4 requires subroutine requires_1(a) @@ -88,15 +85,14 @@ end module m2 ! 2.16 declare-reduction -! subroutine declare_red_1() -! use omp_lib -! integer :: my_var -! !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) -! my_var = 0 -! !$omp parallel reduction (my_add_red : my_var) num_threads(4) -! my_var = omp_get_thread_num() + 1 -! !$omp end parallel -! print *, "sum of thread numbers is ", my_var -! end subroutine declare_red_1 +subroutine declare_red_1() + integer :: my_var + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + my_var = 0 + !$omp parallel reduction (my_add_red : my_var) num_threads(4) + my_var = 1 + !$omp end parallel + print *, "sum of thread numbers is ", my_var +end subroutine declare_red_1 end diff --git a/flang/test/Semantics/OpenMP/declare-reduction.f90 b/flang/test/Semantics/OpenMP/declare-reduction.f90 new file mode 100644 index 0000000000000..8fee79dfc0b7b --- /dev/null +++ b/flang/test/Semantics/OpenMP/declare-reduction.f90 @@ -0,0 +1,11 @@ +! RUN: %flang_fc1 -fdebug-dump-symbols -fopenmp -fopenmp-version=50 %s | FileCheck %s + +program main +!CHECK-LABEL: MainProgram scope: main + + !$omp declare reduction (my_add_red : integer : omp_out = omp_out + omp_in) initializer (omp_priv=0) + +!CHECK: my_add_red: Misc ConstructName + +end program main + diff --git a/libc/docs/gpu/using.rst b/libc/docs/gpu/using.rst index 1c1f9c9bfb0c6..f17f6287be313 100644 --- a/libc/docs/gpu/using.rst +++ b/libc/docs/gpu/using.rst @@ -44,7 +44,7 @@ this shouldn't be necessary. 
$> clang openmp.c -fopenmp --offload-arch=gfx90a -Xoffload-linker -lc $> clang cuda.cu --offload-arch=sm_80 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc - $> clang hip.hip --offload-arch=gfx940 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc + $> clang hip.hip --offload-arch=gfx942 --offload-new-driver -fgpu-rdc -Xoffload-linker -lc This will automatically link in the needed function definitions if they were required by the user's application. Normally using the ``-fgpu-rdc`` option diff --git a/libc/include/__llvm-libc-common.h b/libc/include/__llvm-libc-common.h index 212e3c6a9446c..c6fd33a55532c 100644 --- a/libc/include/__llvm-libc-common.h +++ b/libc/include/__llvm-libc-common.h @@ -47,6 +47,11 @@ #define __NOEXCEPT throw() #endif +// This macro serves as a generic cast implementation for use in both C and C++, +// similar to `__BIONIC_CAST` in Android. +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) (cast(value)) + #else // not __cplusplus #undef __BEGIN_C_DECLS @@ -85,6 +90,9 @@ #undef _Returns_twice #define _Returns_twice __attribute__((returns_twice)) +#undef __LLVM_LIBC_CAST +#define __LLVM_LIBC_CAST(cast, type, value) ((type)(value)) + #endif // __cplusplus #endif // _LLVM_LIBC_COMMON_H diff --git a/libc/include/llvm-libc-macros/endian-macros.h b/libc/include/llvm-libc-macros/endian-macros.h index e1e105d50c1c6..52d95dc01cd83 100644 --- a/libc/include/llvm-libc-macros/endian-macros.h +++ b/libc/include/llvm-libc-macros/endian-macros.h @@ -20,27 +20,27 @@ #define htobe16(x) __builtin_bswap16((x)) #define htobe32(x) __builtin_bswap32((x)) #define htobe64(x) __builtin_bswap64((x)) -#define htole16(x) ((uint16_t)(x)) -#define htole32(x) ((uint32_t)(x)) -#define htole64(x) ((uint64_t)(x)) +#define htole16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htole32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htole64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define be16toh(x) __builtin_bswap16((x)) #define be32toh(x) __builtin_bswap32((x)) #define be64toh(x) __builtin_bswap64((x)) -#define le16toh(x) ((uint16_t)(x)) -#define le32toh(x) ((uint32_t)(x)) -#define le64toh(x) ((uint64_t)(x)) +#define le16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define le32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define le64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #else -#define htobe16(x) ((uint16_t)(x)) -#define htobe32(x) ((uint32_t)(x)) -#define htobe64(x) ((uint64_t)(x)) +#define htobe16(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define htobe32(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define htobe64(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define htole16(x) __builtin_bswap16((x)) #define htole32(x) __builtin_bswap32((x)) #define htole64(x) __builtin_bswap64((x)) -#define be16toh(x) ((uint16_t)(x)) -#define be32toh(x) ((uint32_t)(x)) -#define be64toh(x) ((uint64_t)(x)) +#define be16toh(x) __LLVM_LIBC_CAST(static_cast, uint16_t, x) +#define be32toh(x) __LLVM_LIBC_CAST(static_cast, uint32_t, x) +#define be64toh(x) __LLVM_LIBC_CAST(static_cast, uint64_t, x) #define le16toh(x) __builtin_bswap16((x)) #define le32toh(x) __builtin_bswap32((x)) #define le64toh(x) __builtin_bswap64((x)) diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt index 5cefa8a264310..5a9a26c44f368 100644 --- a/libclc/CMakeLists.txt +++ b/libclc/CMakeLists.txt @@ -20,16 +20,12 @@ include( GNUInstallDirs ) set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS amdgcn-amdhsa/lib/SOURCES; amdgcn/lib/SOURCES; - 
amdgcn-mesa3d/lib/SOURCES; amdgpu/lib/SOURCES; clspv/lib/SOURCES; - clspv64/lib/SOURCES; generic/lib/SOURCES; - ptx/lib/SOURCES; ptx-nvidiacl/lib/SOURCES; r600/lib/SOURCES; spirv/lib/SOURCES; - spirv64/lib/SOURCES; # CLC internal libraries clc/lib/generic/SOURCES; ) @@ -211,7 +207,7 @@ set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 gfx602 gfx705 gfx805 - gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx942 gfx1010 gfx1011 gfx1012 gfx1013 gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 gfx1100 gfx1101 gfx1102 gfx1103 @@ -280,11 +276,6 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( opencl_dirs ) - if ( NOT ${ARCH} STREQUAL spirv AND NOT ${ARCH} STREQUAL spirv64 AND - NOT ${ARCH} STREQUAL clspv AND NOT ${ARCH} STREQUAL clspv64) - LIST( APPEND opencl_dirs generic ) - endif() - if( ${ARCH} STREQUAL r600 OR ${ARCH} STREQUAL amdgcn ) list( APPEND opencl_dirs amdgpu ) endif() @@ -302,8 +293,25 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) set( DARCH ${ARCH} ) endif() + # Append a variety of target- and triple-based directories to search, + # increasing in specificity. + list( APPEND opencl_dirs ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} ) + + # The 'generic' directory contains all of the generic implementations of the + # builtins. It is included first so it has the lowest search priority, + # allowing targets to override builtins based on file names found later in + # the list of search directories. + # CLC builds all builtins for all targets, so unconditionally prepend the + # 'generic' directory. + set( clc_dirs generic ${opencl_dirs} ) + # Some OpenCL targets don't build all builtins, in which case they don't want + # the 'generic' directory. Otherwise, prepend the 'generic' directory. + if ( NOT ARCH STREQUAL spirv AND NOT ARCH STREQUAL spirv64 AND + NOT ARCH STREQUAL clspv AND NOT ARCH STREQUAL clspv64) + list( PREPEND opencl_dirs generic ) + endif() + set( clc_lib_files ) - set( clc_dirs ${dirs} generic ) if( ARCH STREQUAL clspv OR ARCH STREQUAL clspv64 ) set( clc_gen_files clc-clspv-convert.cl ) @@ -315,7 +323,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) clc_lib_files CLC_INTERNAL LIB_ROOT_DIR clc - DIRS ${clc_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${clc_dirs} ) set( opencl_lib_files ) @@ -334,7 +342,7 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} ) libclc_configure_lib_source( opencl_lib_files - DIRS ${opencl_dirs} ${DARCH} ${DARCH}-${OS} ${DARCH}-${VENDOR}-${OS} + DIRS ${opencl_dirs} ) foreach( d ${${t}_devices} ) diff --git a/libclc/clc/lib/generic/math/clc_nextafter.cl b/libclc/clc/lib/generic/math/clc_nextafter.cl index 58125485bf684..f77f3647d5985 100644 --- a/libclc/clc/lib/generic/math/clc_nextafter.cl +++ b/libclc/clc/lib/generic/math/clc_nextafter.cl @@ -1,5 +1,6 @@ #include #include +#include #include // This file provides OpenCL C implementations of __clc_nextafter for @@ -12,21 +13,30 @@ FLOAT_TYPE y) { \ const UINT_TYPE sign_bit = (UINT_TYPE)1 \ << (sizeof(INT_TYPE_SCALAR) * 8 - 1); \ - const UINT_TYPE sign_bit_mask = sign_bit - (UINT_TYPE)1; \ - INT_TYPE ix = CLC_AS_TYPE(INT_TYPE)(x); \ - UINT_TYPE ax = CLC_AS_TYPE(UINT_TYPE)(ix) & sign_bit_mask; \ - INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(sign_bit) - ix; \ - mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? 
mx : ix; \ - INT_TYPE iy = CLC_AS_TYPE(INT_TYPE)(y); \ - UINT_TYPE ay = CLC_AS_TYPE(UINT_TYPE)(iy) & sign_bit_mask; \ - INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(sign_bit) - iy; \ - my = iy < (INT_TYPE)0 ? my : iy; \ + UINT_TYPE ix = CLC_AS_TYPE(UINT_TYPE)(x); \ + FLOAT_TYPE absx = __clc_fabs(x); \ + UINT_TYPE mxu = sign_bit - ix; \ + INT_TYPE mx = CLC_AS_TYPE(INT_TYPE)(mxu); \ + mx = CLC_AS_TYPE(INT_TYPE)(ix) < (INT_TYPE)0 ? mx \ + : CLC_AS_TYPE(INT_TYPE)(ix); \ + UINT_TYPE iy = CLC_AS_TYPE(UINT_TYPE)(y); \ + FLOAT_TYPE absy = __clc_fabs(y); \ + UINT_TYPE myu = sign_bit - iy; \ + INT_TYPE my = CLC_AS_TYPE(INT_TYPE)(myu); \ + my = CLC_AS_TYPE(INT_TYPE)(iy) < (INT_TYPE)0 ? my \ + : CLC_AS_TYPE(INT_TYPE)(iy); \ INT_TYPE t = mx + (mx < my ? (INT_TYPE)1 : (INT_TYPE)-1); \ - INT_TYPE r = CLC_AS_TYPE(INT_TYPE)(sign_bit) - t; \ - r = t < (INT_TYPE)0 ? r : t; \ + UINT_TYPE r = sign_bit - CLC_AS_TYPE(UINT_TYPE)(t); \ + r = (t < (INT_TYPE)0 || (t == (INT_TYPE)0 && mx < my)) \ + ? r \ + : CLC_AS_TYPE(UINT_TYPE)(t); \ r = __clc_isnan(x) ? ix : r; \ - r = __clc_isnan(y) ? CLC_AS_TYPE(INT_TYPE)(iy) : r; \ - r = ((ax | ay) == (UINT_TYPE)0 || ix == iy) ? iy : r; \ + r = __clc_isnan(y) ? iy : r; \ + r = ((CLC_AS_TYPE(UINT_TYPE)(absx) | CLC_AS_TYPE(UINT_TYPE)(absy)) == \ + (UINT_TYPE)0 || \ + ix == iy) \ + ? iy \ + : r; \ return CLC_AS_TYPE(FLOAT_TYPE)(r); \ } diff --git a/libclc/cmake/modules/AddLibclc.cmake b/libclc/cmake/modules/AddLibclc.cmake index 40e31e0ba4f45..911559ff4bfa9 100644 --- a/libclc/cmake/modules/AddLibclc.cmake +++ b/libclc/cmake/modules/AddLibclc.cmake @@ -402,7 +402,8 @@ endfunction(add_libclc_builtin_set) # directory. If not provided, is set to '.'. # * DIRS ... # List of directories under LIB_ROOT_DIR to walk over searching for SOURCES -# files +# files. Directories earlier in the list have lower priority than +# subsequent ones. function(libclc_configure_lib_source LIB_FILE_LIST) cmake_parse_arguments(ARG "CLC_INTERNAL" @@ -417,7 +418,7 @@ function(libclc_configure_lib_source LIB_FILE_LIST) # Enumerate SOURCES* files set( source_list ) - foreach( l ${ARG_DIRS} ) + foreach( l IN LISTS ARG_DIRS ) foreach( s "SOURCES" "SOURCES_${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}" ) if( ARG_CLC_INTERNAL ) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/lib/${l}/${s} file_loc ) @@ -425,10 +426,10 @@ function(libclc_configure_lib_source LIB_FILE_LIST) file( TO_CMAKE_PATH ${ARG_LIB_ROOT_DIR}/${l}/lib/${s} file_loc ) endif() file( TO_CMAKE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${file_loc} loc ) - # Prepend the location to give higher priority to - # specialized implementation + # Prepend the location to give higher priority to the specialized + # implementation if( EXISTS ${loc} ) - set( source_list ${file_loc} ${source_list} ) + list( PREPEND source_list ${file_loc} ) endif() endforeach() endforeach() diff --git a/libclc/generic/lib/math/ep_log.cl b/libclc/generic/lib/math/ep_log.cl index 90c9fa426fec1..f0b5d3fdfbb1c 100644 --- a/libclc/generic/lib/math/ep_log.cl +++ b/libclc/generic/lib/math/ep_log.cl @@ -38,57 +38,57 @@ #define LF1 1.24999999978138668903e-02 #define LF2 2.23219810758559851206e-03 -_CLC_DEF void __clc_ep_log(double x, int *xexp, double *r1, double *r2) -{ - // Computes natural log(x). 
Algorithm based on: - // Ping-Tak Peter Tang - // "Table-driven implementation of the logarithm function in IEEE - // floating-point arithmetic" - // ACM Transactions on Mathematical Software (TOMS) - // Volume 16, Issue 4 (December 1990) - int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; - - ulong ux = as_ulong(x); - ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); - int c = ux < IMPBIT_DP64; - ux = c ? uxs : ux; - int expadjust = c ? 60 : 0; - - // Store the exponent of x in xexp and put f into the range [0.5,1) - int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; - double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); - *xexp = near_one ? 0 : xexp1; - - double r = x - 1.0; - double u1 = MATH_DIVIDE(r, 2.0 + r); - double ru1 = -r * u1; - u1 = u1 + u1; - - int index = as_int2(ux).hi >> 13; - index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); - - double f1 = index * 0x1.0p-7; - double f2 = f - f1; - double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); - - double2 tv = USE_TABLE(ln_tbl, (index - 64)); - double z1 = tv.s0; - double q = tv.s1; - - z1 = near_one ? r : z1; - q = near_one ? 0.0 : q; - double u = near_one ? u1 : u2; - double v = u*u; - - double cc = near_one ? ru1 : u2; - - double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); - double z22 = fma(v, fma(v, LF2, LF1), LF0); - double z2 = near_one ? z21 : z22; - z2 = fma(u*v, z2, cc) + q; - - *r1 = z1; - *r2 = z2; +_CLC_DEF void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2) { + // Computes natural log(x). Algorithm based on: + // Ping-Tak Peter Tang + // "Table-driven implementation of the logarithm function in IEEE + // floating-point arithmetic" + // ACM Transactions on Mathematical Software (TOMS) + // Volume 16, Issue 4 (December 1990) + int near_one = x >= 0x1.e0faap-1 & x <= 0x1.1082cp+0; + + ulong ux = as_ulong(x); + ulong uxs = as_ulong(as_double(0x03d0000000000000UL | ux) - 0x1.0p-962); + int c = ux < IMPBIT_DP64; + ux = c ? uxs : ux; + int expadjust = c ? 60 : 0; + + // Store the exponent of x in xexp and put f into the range [0.5,1) + int xexp1 = ((as_int2(ux).hi >> 20) & 0x7ff) - EXPBIAS_DP64 - expadjust; + double f = as_double(HALFEXPBITS_DP64 | (ux & MANTBITS_DP64)); + *xexp = near_one ? 0 : xexp1; + + double r = x - 1.0; + double u1 = MATH_DIVIDE(r, 2.0 + r); + double ru1 = -r * u1; + u1 = u1 + u1; + + int index = as_int2(ux).hi >> 13; + index = ((0x80 | (index & 0x7e)) >> 1) + (index & 0x1); + + double f1 = index * 0x1.0p-7; + double f2 = f - f1; + double u2 = MATH_DIVIDE(f2, fma(0.5, f2, f1)); + + double2 tv = USE_TABLE(ln_tbl, (index - 64)); + double z1 = tv.s0; + double q = tv.s1; + + z1 = near_one ? r : z1; + q = near_one ? 0.0 : q; + double u = near_one ? u1 : u2; + double v = u * u; + + double cc = near_one ? ru1 : u2; + + double z21 = fma(v, fma(v, fma(v, LN3, LN2), LN1), LN0); + double z22 = fma(v, fma(v, LF2, LF1), LF0); + double z2 = near_one ? 
z21 : z22; + z2 = fma(u * v, z2, cc) + q; + + *r1 = z1; + *r2 = z2; } #endif diff --git a/libclc/generic/lib/math/ep_log.h b/libclc/generic/lib/math/ep_log.h index 414e6231f7fd6..3176cfe5b42ce 100644 --- a/libclc/generic/lib/math/ep_log.h +++ b/libclc/generic/lib/math/ep_log.h @@ -26,6 +26,7 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_ep_log(double x, int *xexp, double *r1, double *r2); +_CLC_DECL void __clc_ep_log(double x, private int *xexp, private double *r1, + private double *r2); #endif diff --git a/libclc/generic/lib/math/modf.inc b/libclc/generic/lib/math/modf.inc index 1ffc6d9e851bd..ff7ef30dd42f8 100644 --- a/libclc/generic/lib/math/modf.inc +++ b/libclc/generic/lib/math/modf.inc @@ -28,18 +28,20 @@ #define ZERO 0.0h #endif -_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, __CLC_GENTYPE *iptr) { +_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, + private __CLC_GENTYPE *iptr) { *iptr = trunc(x); return copysign(isinf(x) ? ZERO : x - *iptr, x); } -#define MODF_DEF(addrspace) \ - _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, addrspace __CLC_GENTYPE *iptr) { \ - __CLC_GENTYPE private_iptr; \ - __CLC_GENTYPE ret = modf(x, &private_iptr); \ - *iptr = private_iptr; \ - return ret; \ -} +#define MODF_DEF(addrspace) \ + _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE modf(__CLC_GENTYPE x, \ + addrspace __CLC_GENTYPE *iptr) { \ + __CLC_GENTYPE private_iptr; \ + __CLC_GENTYPE ret = modf(x, &private_iptr); \ + *iptr = private_iptr; \ + return ret; \ + } MODF_DEF(local); MODF_DEF(global); diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl index 22f2bf61bf27d..441bad2be432f 100644 --- a/libclc/generic/lib/math/sincos_helpers.cl +++ b/libclc/generic/lib/math/sincos_helpers.cl @@ -119,8 +119,8 @@ _CLC_DEF float __clc_tanf_piby4(float x, int regn) { return regn & 1 ? 
tr : t; } -_CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, - float bt) { +_CLC_DEF void __clc_fullMulS(private float *hi, private float *lo, float a, + float b, float bh, float bt) { if (HAVE_HW_FMA32()) { float ph = a * b; *hi = ph; @@ -136,7 +136,7 @@ _CLC_DEF void __clc_fullMulS(float *hi, float *lo, float a, float b, float bh, } } -_CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { +_CLC_DEF float __clc_removePi2S(private float *hi, private float *lo, float x) { // 72 bits of pi/2 const float fpiby2_1 = (float)0xC90FDA / 0x1.0p+23f; const float fpiby2_1_h = (float)0xC90 / 0x1.0p+11f; @@ -174,7 +174,8 @@ _CLC_DEF float __clc_removePi2S(float *hi, float *lo, float x) { return fnpi2; } -_CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionSmallS(private float *r, private float *rr, + float x) { float fnpi2 = __clc_removePi2S(r, rr, x); return (int)fnpi2 & 0x3; } @@ -188,7 +189,8 @@ _CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x) { HI = __clc_mul_hi(A, B); \ HI += LO < C -_CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionLargeS(private float *r, private float *rr, + float x) { int xe = (int)(as_uint(x) >> 23) - 127; uint xm = 0x00800000U | (as_uint(x) & 0x7fffffU); @@ -330,7 +332,7 @@ _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x) { return ((i >> 1) + (i & 1)) & 0x3; } -_CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { +_CLC_DEF int __clc_argReductionS(private float *r, private float *rr, float x) { if (x < 0x1.0p+23f) return __clc_argReductionSmallS(r, rr, x); else @@ -342,8 +344,9 @@ _CLC_DEF int __clc_argReductionS(float *r, float *rr, float x) { #pragma OPENCL EXTENSION cl_khr_fp64 : enable // Reduction for medium sized arguments -_CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn) { // How many pi/2 is x a multiple of? const double two_by_pi = 0x1.45f306dc9c883p-1; double dnpi2 = __clc_trunc(fma(x, two_by_pi, 0.5)); @@ -387,8 +390,9 @@ _CLC_DEF void __clc_remainder_piby2_medium(double x, double *r, double *rr, // Return value "regn" tells how many lots of pi/2 were subtracted // from x to put it in the range [-pi/4,pi/4], mod 4. 
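Aside, not part of this patch: the quadrant value returned through regn is normally folded back into the sin/cos selection by the callers of these helpers. A minimal sketch in the same OpenCL C style, assuming __clc_sincos_piby4 returns the sine and cosine of the reduced argument in .s0 and .s1 respectively (that component order is an assumption, not taken from this patch):

// Sketch only; verify the component order against the real __clc_sincos_piby4.
double example_sin(double x) {
  double r, rr; // automatic variables live in the 'private' address space,
  int regn;     // matching the new explicit qualifiers on the pointer params
  __clc_remainder_piby2_medium(x, &r, &rr, &regn);
  double2 sc = __clc_sincos_piby4(r, rr);
  double s = (regn & 1) ? sc.s1 : sc.s0; // odd quadrants swap sin and cos
  return (regn & 2) ? -s : s;            // quadrants 2 and 3 flip the sign
}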
-_CLC_DEF void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn) { +_CLC_DEF void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn) { long ux = as_long(x); int e = (int)(ux >> 52) - 1023; diff --git a/libclc/generic/lib/math/sincos_helpers.h b/libclc/generic/lib/math/sincos_helpers.h index 6dbca73aa2a2e..c7981e5278f2a 100644 --- a/libclc/generic/lib/math/sincos_helpers.h +++ b/libclc/generic/lib/math/sincos_helpers.h @@ -26,16 +26,18 @@ _CLC_DECL float __clc_sinf_piby4(float x, float y); _CLC_DECL float __clc_cosf_piby4(float x, float y); _CLC_DECL float __clc_tanf_piby4(float x, int y); -_CLC_DECL int __clc_argReductionS(float *r, float *rr, float x); +_CLC_DECL int __clc_argReductionS(private float *r, private float *rr, float x); #ifdef cl_khr_fp64 #pragma OPENCL EXTENSION cl_khr_fp64 : enable -_CLC_DECL void __clc_remainder_piby2_medium(double x, double *r, double *rr, - int *regn); -_CLC_DECL void __clc_remainder_piby2_large(double x, double *r, double *rr, - int *regn); +_CLC_DECL void __clc_remainder_piby2_medium(double x, private double *r, + private double *rr, + private int *regn); +_CLC_DECL void __clc_remainder_piby2_large(double x, private double *r, + private double *rr, + private int *regn); _CLC_DECL double2 __clc_sincos_piby4(double x, double xx); #endif diff --git a/libcxx/docs/ReleaseNotes/21.rst b/libcxx/docs/ReleaseNotes/21.rst index 2439360797023..88a0666611a9a 100644 --- a/libcxx/docs/ReleaseNotes/21.rst +++ b/libcxx/docs/ReleaseNotes/21.rst @@ -43,8 +43,8 @@ Implemented Papers Improvements and New Features ----------------------------- -- The ``std::ranges::{copy, copy_n, copy_backward}`` algorithms have been optimized for ``std::vector::iterator``\s, - resulting in a performance improvement of up to 2000x. +- The ``std::ranges::{copy, copy_n, copy_backward, move, move_backward}`` algorithms have been optimized for + ``std::vector::iterator``, resulting in a performance improvement of up to 2000x. - Updated formatting library to Unicode 16.0.0. diff --git a/libcxx/include/__algorithm/move.h b/libcxx/include/__algorithm/move.h index 6f3b0eb5d2927..a3320e9f1985d 100644 --- a/libcxx/include/__algorithm/move.h +++ b/libcxx/include/__algorithm/move.h @@ -9,11 +9,13 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_H #define _LIBCPP___ALGORITHM_MOVE_H +#include <__algorithm/copy.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/for_each_segment.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -98,6 +100,14 @@ struct __move_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. 
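A minimal usage sketch of what the new __bit_iterator overload enables (illustrative only, not part of the patch): vector<bool> elements carry no per-element move state, so moving bits is the same as copying them, and std::move on vector<bool>::iterator now takes the same optimized word-wise path as std::copy.

#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  std::vector<bool> in(256, true);
  std::vector<bool> out(256, false);
  // Dispatches to the bit-wise __copy fast path added above.
  std::move(in.begin(), in.end(), out.begin());
  assert(out == in);
  return 0;
}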
template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__algorithm/move_backward.h b/libcxx/include/__algorithm/move_backward.h index 24a8d9b24527a..14482fee18114 100644 --- a/libcxx/include/__algorithm/move_backward.h +++ b/libcxx/include/__algorithm/move_backward.h @@ -9,10 +9,12 @@ #ifndef _LIBCPP___ALGORITHM_MOVE_BACKWARD_H #define _LIBCPP___ALGORITHM_MOVE_BACKWARD_H +#include <__algorithm/copy_backward.h> #include <__algorithm/copy_move_common.h> #include <__algorithm/iterator_operations.h> #include <__algorithm/min.h> #include <__config> +#include <__fwd/bit_reference.h> #include <__iterator/iterator_traits.h> #include <__iterator/segmented_iterator.h> #include <__type_traits/common_type.h> @@ -107,6 +109,14 @@ struct __move_backward_impl { } } + template + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 pair<__bit_iterator<_Cp, _IsConst>, __bit_iterator<_Cp, false> > + operator()(__bit_iterator<_Cp, _IsConst> __first, + __bit_iterator<_Cp, _IsConst> __last, + __bit_iterator<_Cp, false> __result) { + return std::__copy_backward<_ClassicAlgPolicy>(__first, __last, __result); + } + // At this point, the iterators have been unwrapped so any `contiguous_iterator` has been unwrapped to a pointer. template ::value, int> = 0> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 pair<_In*, _Out*> diff --git a/libcxx/include/__bit_reference b/libcxx/include/__bit_reference index aad470394732c..377f5fed12266 100644 --- a/libcxx/include/__bit_reference +++ b/libcxx/include/__bit_reference @@ -210,22 +210,6 @@ private: __mask_(__m) {} }; -// move - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> -move(__bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy(__first, __last, __result); -} - -// move_backward - -template -inline _LIBCPP_HIDE_FROM_ABI __bit_iterator<_Cp, false> move_backward( - __bit_iterator<_Cp, _IsConst> __first, __bit_iterator<_Cp, _IsConst> __last, __bit_iterator<_Cp, false> __result) { - return std::copy_backward(__first, __last, __result); -} - // swap_ranges template diff --git a/libcxx/include/__configuration/platform.h b/libcxx/include/__configuration/platform.h index cff99376ee24b..8d0f8f63f5213 100644 --- a/libcxx/include/__configuration/platform.h +++ b/libcxx/include/__configuration/platform.h @@ -32,12 +32,14 @@ // Need to detect which libc we're using if we're on Linux. 
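Sketch of the behavioral difference introduced by the __has_include guard (the consumer below is a hypothetical example, not a real libc++ call site): on hosted glibc targets <features.h> exists and _LIBCPP_GLIBC_PREREQ keeps its old meaning, while on freestanding targets such as the GPU builds the header is absent and the macro is now simply left undefined, so consumers must tolerate that.

#if defined(_LIBCPP_GLIBC_PREREQ) && _LIBCPP_GLIBC_PREREQ(2, 27)
// glibc >= 2.27 specific path (hypothetical)
#else
// portable fallback, also taken on non-glibc and freestanding targets
#endif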
#if defined(__linux__) || defined(__AMDGPU__) || defined(__NVPTX__) -# include -# if defined(__GLIBC_PREREQ) -# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) -# else -# define _LIBCPP_GLIBC_PREREQ(a, b) 0 -# endif // defined(__GLIBC_PREREQ) +# if __has_include() +# include +# if defined(__GLIBC_PREREQ) +# define _LIBCPP_GLIBC_PREREQ(a, b) __GLIBC_PREREQ(a, b) +# else +# define _LIBCPP_GLIBC_PREREQ(a, b) 0 +# endif // defined(__GLIBC_PREREQ) +# endif #endif #ifndef __BYTE_ORDER__ diff --git a/libcxx/include/__variant/monostate.h b/libcxx/include/__variant/monostate.h index c5d2dacaf4205..b29bbdf5cdbe4 100644 --- a/libcxx/include/__variant/monostate.h +++ b/libcxx/include/__variant/monostate.h @@ -49,10 +49,12 @@ _LIBCPP_HIDE_FROM_ABI inline constexpr bool operator>=(monostate, monostate) noe template <> struct _LIBCPP_TEMPLATE_VIS hash { - using argument_type = monostate; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = monostate; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - inline _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type&) const _NOEXCEPT { + inline _LIBCPP_HIDE_FROM_ABI size_t operator()(const monostate&) const noexcept { return 66740831; // return a fundamentally attractive random value. } }; diff --git a/libcxx/include/string b/libcxx/include/string index 396e73522d3e7..3f43e8fd8d586 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -691,50 +691,11 @@ _LIBCPP_BEGIN_NAMESPACE_STD // basic_string -template -basic_string<_CharT, _Traits, _Allocator> _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const _CharT* __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(_CharT __x, const basic_string<_CharT, _Traits, _Allocator>& __y); - -template -inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, const _CharT* __y); - template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __x, _CharT __y); - -# if _LIBCPP_STD_VER >= 26 - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs); - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs); - -# endif - -extern template _LIBCPP_EXPORTED_FROM_ABI string operator+ - , allocator >(char const*, string const&); +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > 
__str2); template struct __string_is_trivial_iterator : public false_type {}; @@ -2425,15 +2386,8 @@ private: std::__throw_out_of_range("basic_string"); } - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const value_type*, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(value_type, const basic_string&); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, const value_type*); - friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string operator+ <>(const basic_string&, value_type); -# if _LIBCPP_STD_VER >= 26 - friend constexpr basic_string operator+ <>(const basic_string&, type_identity_t<__self_view>); - friend constexpr basic_string operator+ <>(type_identity_t<__self_view>, const basic_string&); -# endif + friend _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string + __concatenate_strings<>(const _Allocator&, __type_identity_t<__self_view>, __type_identity_t<__self_view>); template friend inline _LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI bool @@ -3815,83 +3769,73 @@ operator>=(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) { +__concatenate_strings(const _Allocator& __alloc, + __type_identity_t > __str1, + __type_identity_t > __str2) { using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = __lhs.size(); - auto __rhs_sz = __rhs.size(); _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); + __str1.size() + __str2.size(), + _String::__alloc_traits::select_on_container_copy_construction(__alloc)); auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); + _Traits::copy(__ptr, __str1.data(), __str1.size()); + _Traits::copy(__ptr + __str1.size(), __str2.data(), __str2.size()); + _Traits::assign(__ptr[__str1.size() + __str2.size()], _CharT()); return __r; } +template +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + template _LIBCPP_HIDDEN _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const _CharT* __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - auto __lhs_sz = _Traits::length(__lhs); - auto __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs, __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +extern template 
_LIBCPP_EXPORTED_FROM_ABI string operator+ + , allocator >(char const*, string const&); + template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(_CharT __lhs, const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __rhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::assign(__ptr, 1, __lhs); - _Traits::copy(__ptr + 1, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + 1 + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __rhs.get_allocator(), basic_string_view<_CharT, _Traits>(&__lhs, 1), __rhs); } template -inline _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, const _CharT* __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = _Traits::length(__rhs); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs, __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); } template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 basic_string<_CharT, _Traits, _Allocator> operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, _CharT __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + 1, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::assign(__ptr + __lhs_sz, 1, __rhs); - _Traits::assign(__ptr + 1 + __lhs_sz, 1, _CharT()); - return __r; + return std::__concatenate_strings<_CharT, _Traits>( + __lhs.get_allocator(), __lhs, basic_string_view<_CharT, _Traits>(&__rhs, 1)); +} +# if _LIBCPP_STD_VER >= 26 + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, + type_identity_t> __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__lhs.get_allocator(), __lhs, __rhs); +} + +template +_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> +operator+(type_identity_t> __lhs, + const basic_string<_CharT, _Traits, _Allocator>& __rhs) { + return std::__concatenate_strings<_CharT, _Traits>(__rhs.get_allocator(), __lhs, __rhs); } +# endif // _LIBCPP_STD_VER >= 26 + # ifndef _LIBCPP_CXX03_LANG template @@ -3942,54 +3886,18 @@ operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, _CharT __rhs) { # if _LIBCPP_STD_VER >= 26 -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(const basic_string<_CharT, _Traits, _Allocator>& __lhs, 
- type_identity_t> __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__lhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; -} - template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(basic_string<_CharT, _Traits, _Allocator>&& __lhs, type_identity_t> __rhs) { - __lhs.append(__rhs); - return std::move(__lhs); -} - -template -_LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> -operator+(type_identity_t> __lhs, - const basic_string<_CharT, _Traits, _Allocator>& __rhs) { - using _String = basic_string<_CharT, _Traits, _Allocator>; - typename _String::size_type __lhs_sz = __lhs.size(); - typename _String::size_type __rhs_sz = __rhs.size(); - _String __r(__uninitialized_size_tag(), - __lhs_sz + __rhs_sz, - _String::__alloc_traits::select_on_container_copy_construction(__rhs.get_allocator())); - auto __ptr = std::__to_address(__r.__get_pointer()); - _Traits::copy(__ptr, __lhs.data(), __lhs_sz); - _Traits::copy(__ptr + __lhs_sz, __rhs.data(), __rhs_sz); - _Traits::assign(__ptr + __lhs_sz + __rhs_sz, 1, _CharT()); - return __r; + return std::move(__lhs.append(__rhs)); } template _LIBCPP_HIDE_FROM_ABI constexpr basic_string<_CharT, _Traits, _Allocator> operator+(type_identity_t> __lhs, basic_string<_CharT, _Traits, _Allocator>&& __rhs) { - __rhs.insert(0, __lhs); - return std::move(__rhs); + return std::move(__rhs.insert(0, __lhs)); } # endif // _LIBCPP_STD_VER >= 26 diff --git a/libcxx/include/variant b/libcxx/include/variant index 3786d9524020b..9998d4a457715 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -1585,10 +1585,12 @@ swap(variant<_Types...>& __lhs, template struct _LIBCPP_TEMPLATE_VIS hash< __enable_hash_helper, remove_const_t<_Types>...>> { - using argument_type = variant<_Types...>; - using result_type = size_t; +# if _LIBCPP_STD_VER <= 17 || defined(_LIBCPP_ENABLE_CXX20_REMOVED_BINDER_TYPEDEFS) + using argument_type _LIBCPP_DEPRECATED_IN_CXX17 = variant<_Types...>; + using result_type _LIBCPP_DEPRECATED_IN_CXX17 = size_t; +# endif - _LIBCPP_HIDE_FROM_ABI result_type operator()(const argument_type& __v) const { + _LIBCPP_HIDE_FROM_ABI size_t operator()(const variant<_Types...>& __v) const { using __variant_detail::__visitation::__variant; size_t __res = __v.valueless_by_exception() diff --git a/libcxx/test/benchmarks/algorithms/move.bench.cpp b/libcxx/test/benchmarks/algorithms/move.bench.cpp new file mode 100644 index 0000000000000..73f36f0c129de --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include + +template +void bm_ranges_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move(*in, std::ranges::begin(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move(std::views::counted(in->begin() + 4, n - 4), std::ranges::begin(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template +void bm_move_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto first2 = out->begin(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move(first1, last1, first2)); + } else { + benchmark::DoNotOptimize(std::move(first1 + 4, last1, first2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_vb) + ->Name("bm_ranges_move_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_vb)->Name("bm_ranges_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_vb)->Name("bm_move_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_vb)->Name("bm_move_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp new file mode 100644 index 0000000000000..23d7395198419 --- /dev/null +++ b/libcxx/test/benchmarks/algorithms/move_backward.bench.cpp @@ -0,0 +1,71 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 + +#include +#include +#include +#include + +template +void bm_ranges_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + if constexpr (aligned) { + benchmark::DoNotOptimize(std::ranges::move_backward(*in, std::ranges::end(*out))); + } else { + benchmark::DoNotOptimize( + std::ranges::move_backward(std::views::counted(in->begin(), n - 4), std::ranges::end(*out))); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +template +void bm_move_backward_vb(benchmark::State& state) { + auto n = state.range(); + std::vector v1(n, true); + std::vector v2(n, false); + benchmark::DoNotOptimize(v1); + benchmark::DoNotOptimize(v2); + std::vector* in = &v1; + std::vector* out = &v2; + for (auto _ : state) { + auto first1 = in->begin(); + auto last1 = in->end(); + auto last2 = out->end(); + if constexpr (aligned) { + benchmark::DoNotOptimize(std::move_backward(first1, last1, last2)); + } else { + benchmark::DoNotOptimize(std::move_backward(first1, last1 - 4, last2)); + } + std::swap(in, out); + benchmark::DoNotOptimize(in); + benchmark::DoNotOptimize(out); + } +} + +BENCHMARK(bm_ranges_move_backward_vb) + ->Name("bm_ranges_move_backward_vb_aligned") + ->Range(8, 1 << 16) + ->DenseRange(102400, 204800, 4096); +BENCHMARK(bm_ranges_move_backward_vb)->Name("bm_ranges_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK(bm_move_backward_vb)->Name("bm_move_backward_vb_aligned")->Range(8, 1 << 20); +BENCHMARK(bm_move_backward_vb)->Name("bm_move_backward_vb_unaligned")->Range(8, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp index 1ca397c92a334..3d4ee23a5a7ff 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -81,7 +82,7 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - types::for_each(types::cpp17_input_iterator_list(), TestInIters()); + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); @@ -91,7 +92,6 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::copy(a + 3, a + 10, a); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp index 445c7718e1111..8a528a96f5294 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_backward.pass.cpp @@ -19,6 +19,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include 
"user_defined_integral.h" class PaddedBase { @@ -36,21 +37,29 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_backward() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); - assert(base(r) == ib); - for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each(types::bidirectional_iterator_list(), TestImpl()); } -} + + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); + assert(base(r) == ib); + for (unsigned i = 0; i < N; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -70,31 +79,10 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } return true; -}; +} TEST_CONSTEXPR_CXX20 bool test() { - test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - test_copy_backward, bidirectional_iterator >(); - test_copy_backward, random_access_iterator >(); - test_copy_backward, int*>(); - - test_copy_backward >(); - test_copy_backward >(); - test_copy_backward(); - -#if TEST_STD_VER > 17 - test_copy_backward, bidirectional_iterator>(); - test_copy_backward, random_access_iterator>(); - test_copy_backward, int*>(); - - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward, contiguous_iterator>(); - test_copy_backward>(); -#endif + types::for_each(types::bidirectional_iterator_list(), TestIterators()); { // Make sure that padding bits aren't copied Derived src(1, 2, 3); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp index 57214e65455b4..3bee77738e342 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_if.pass.cpp @@ -19,75 +19,48 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" -struct Pred -{ - TEST_CONSTEXPR_CXX14 bool operator()(int i) {return i % 3 == 0;} +struct Pred { + TEST_CONSTEXPR_CXX14 bool operator()(int i) { return i % 3 == 0; } }; -template -TEST_CONSTEXPR_CXX20 void -test_copy_if() -{ +template +struct TestOutIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::copy_if(InIter(ia), InIter(ia+N), OutIter(ib), Pred()); - assert(base(r) == ib+N/3+1); - for (unsigned i = 0; i < N/3+1; ++i) - assert(ib[i] % 3 == 0); -} - -TEST_CONSTEXPR_CXX20 bool -test() -{ - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, 
bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); - - test_copy_if, cpp17_output_iterator >(); - test_copy_if, cpp17_input_iterator >(); - test_copy_if, forward_iterator >(); - test_copy_if, bidirectional_iterator >(); - test_copy_if, random_access_iterator >(); - test_copy_if, int*>(); + OutIter r = std::copy_if(InIter(ia), InIter(ia + N), OutIter(ib), Pred()); + assert(base(r) == ib + N / 3 + 1); + for (unsigned i = 0; i < N / 3 + 1; ++i) + assert(ib[i] % 3 == 0); + } +}; - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if >(); - test_copy_if(); +struct TestInIters { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestOutIters()); + } +}; +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::cpp17_input_iterator_list(), TestInIters()); return true; } -int main(int, char**) -{ - test(); +int main(int, char**) { + test(); #if TEST_STD_VER > 17 - static_assert(test()); + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp index 889e71f4eceb9..2053134a01a2f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/copy_n.pass.cpp @@ -18,6 +18,7 @@ #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" typedef UserDefinedIntegral UDI; @@ -37,37 +38,31 @@ class Derived : public PaddedBase { std::int8_t c_; }; -template -TEST_CONSTEXPR_CXX20 void test_copy_n() { - { - const unsigned N = 1000; - int ia[N] = {}; - for (unsigned i = 0; i < N; ++i) - ia[i] = i; - int ib[N] = {0}; - - OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); - assert(base(r) == ib + N / 2); - for (unsigned i = 0; i < N / 2; ++i) - assert(ia[i] == ib[i]); +struct TestIterators { + template + TEST_CONSTEXPR_CXX20 void operator()() { + types::for_each( + types::concatenate_t, types::type_list > >(), + TestImpl()); } - { // Make sure that padding bits aren't copied - Derived src(1, 2, 3); - Derived dst(4, 5, 6); - std::copy_n(static_cast(&src), 1, static_cast(&dst)); - assert(dst.a_ == 1); - assert(dst.b_ == 2); - assert(dst.c_ == 6); - } - - { // Make sure that overlapping ranges can be copied - int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - std::copy_n(a + 3, 7, a); - int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; - assert(std::equal(a, a + 10, expected)); - } -} + template + struct TestImpl { + template + TEST_CONSTEXPR_CXX20 void operator()() { + const unsigned N = 1000; + int ia[N] = {}; + for (unsigned i = 0; i < N; ++i) + ia[i] = i; + int ib[N] = {0}; + + OutIter r = std::copy_n(InIter(ia), UDI(N / 2), OutIter(ib)); + assert(base(r) == ib + N / 2); + for (unsigned i = 0; i < N / 2; ++i) + assert(ia[i] == ib[i]); + } + }; +}; TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -90,40 +85,23 @@ TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { } TEST_CONSTEXPR_CXX20 bool test() { - test_copy_n, cpp17_output_iterator >(); - test_copy_n, 
cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n, cpp17_output_iterator >(); - test_copy_n, cpp17_input_iterator >(); - test_copy_n, forward_iterator >(); - test_copy_n, bidirectional_iterator >(); - test_copy_n, random_access_iterator >(); - test_copy_n, int*>(); - - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n >(); - test_copy_n(); + types::for_each(types::cpp17_input_iterator_list(), TestIterators()); + + { // Make sure that padding bits aren't copied + Derived src(1, 2, 3); + Derived dst(4, 5, 6); + std::copy_n(static_cast(&src), 1, static_cast(&dst)); + assert(dst.a_ == 1); + assert(dst.b_ == 2); + assert(dst.c_ == 6); + } + + { // Make sure that overlapping ranges can be copied + int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; + std::copy_n(a + 3, 7, a); + int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; + assert(std::equal(a, a + 10, expected)); + } { // Test vector::iterator optimization assert(test_vector_bool(8)); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp index bee1ef9bcec33..6229aac733a9c 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp index 128108ac13811..7208be75c70d0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/pstl.copy_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(copy_n); @@ -58,7 +59,7 @@ struct TestIteratorsInt { }; struct CopiedToTester { - bool copied_to = false; + bool copied_to = false; CopiedToTester() = default; CopiedToTester(const CopiedToTester&) {} CopiedToTester& operator=(const CopiedToTester&) { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp index c7031f63a02f6..577328d663d9f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.copy/ranges.copy_n.pass.cpp @@ -40,7 +40,7 @@ static_assert(!HasCopyNIt, std::ranges::in_out_result>); -template +template constexpr void test_iterators() { { // simple test std::array in{1, 2, 3, 4}; @@ -61,26 +61,6 
@@ constexpr void test_iterators() { } } -template -constexpr void test_in_iterators() { - test_iterators, Out, sentinel_wrapper>>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); - test_iterators, Out>(); -} - -template -constexpr void test_proxy_in_iterators() { - test_iterators>, - Out, - sentinel_wrapper>>>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); - test_iterators>, Out>(); -} - #if TEST_STD_VER >= 23 constexpr bool test_vector_bool(std::size_t N) { std::vector in(N, false); @@ -104,17 +84,12 @@ constexpr bool test_vector_bool(std::size_t N) { #endif constexpr bool test() { - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - test_in_iterators>(); - - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); - test_proxy_in_iterators>>(); + types::for_each(types::cpp20_input_iterator_list{}, []() { + types::for_each(types::cpp20_input_iterator_list{}, []() { + test_iterators(); + test_iterators, ProxyIterator>(); + }); + }); { // check that every element is copied exactly once struct CopyOnce { diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp index 7656be73c14c6..0e532ae834e7f 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill.pass.cpp @@ -22,6 +22,7 @@ #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" template TEST_CONSTEXPR_CXX20 void diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp index 3b67101a8b29e..98c412fb6cdc0 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/fill_n.pass.cpp @@ -14,108 +14,93 @@ // fill_n(Iter first, Size n, const T& value); #include +#include #include +#include #include #include "sized_allocator.h" #include "test_macros.h" #include "test_iterators.h" +#include "type_algorithms.h" #include "user_defined_integral.h" -#if TEST_STD_VER > 17 -TEST_CONSTEXPR bool test_constexpr() { - const std::size_t N = 5; - int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N - - auto it = std::fill_n(std::begin(ib), N, 5); - return it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && - *it == 0 // don't overwrite the last value in the output array - ; -} -#endif - typedef UserDefinedIntegral UDI; -template -void test_char() { - char a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), char(1)); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +template +TEST_CONSTEXPR_CXX20 void +test(Container in, size_t from, size_t n, typename Container::value_type value, Container expected) { + Iter it = std::fill_n(Iter(in.data() + from), UDI(n), value); + assert(base(it) == in.data() + from + n); + assert(in == expected); } -template -void test_int() { - int a[4] = {}; - Iter it = std::fill_n(Iter(a), UDI(4), 1); - assert(base(it) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); -} +template +struct Test { + template + TEST_CONSTEXPR_CXX20 void operator()() 
{ + { + std::array in = {1, 2, 3, 4}; + std::array expected = {5, 5, 5, 5}; + test(in, 0, 4, 5, expected); + } + { + std::array in = {1, 2, 3, 4}; + std::array expected = {1, 5, 5, 4}; + test(in, 1, 2, 5, expected); + } + } +}; -void test_int_array() { - int a[4] = {}; - assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); - assert(a[0] == 1); - assert(a[1] == 1); - assert(a[2] == 1); - assert(a[3] == 1); +TEST_CONSTEXPR_CXX20 void test_int_array() { + { + int a[4] = {}; + assert(std::fill_n(a, UDI(4), static_cast(1)) == a + 4); + assert(a[0] == 1 && a[1] == 1 && a[2] == 1 && a[3] == 1); + } +#if TEST_STD_VER >= 11 + { + const std::size_t N = 5; + int ib[] = {0, 0, 0, 0, 0, 0}; // one bigger than N + + auto it = std::fill_n(std::begin(ib), N, 5); + assert(it == (std::begin(ib) + N) && std::all_of(std::begin(ib), it, [](int a) { return a == 5; }) && + *it == 0 // don't overwrite the last value in the output array + ); + } +#endif } struct source { - source() : i(0) {} - - operator int() const { return i++; } - mutable int i; + TEST_CONSTEXPR source() = default; + TEST_CONSTEXPR_CXX20 operator int() const { return 1; } }; -void test_int_array_struct_source() { +TEST_CONSTEXPR_CXX20 void test_int_array_struct_source() { int a[4] = {}; assert(std::fill_n(a, UDI(4), source()) == a + 4); - assert(a[0] == 0); + assert(a[0] == 1); assert(a[1] == 1); - assert(a[2] == 2); - assert(a[3] == 3); -} - -struct test1 { - test1() : c(0) {} - test1(char xc) : c(xc + 1) {} - char c; -}; - -void test_struct_array() { - test1 test1a[4] = {}; - assert(std::fill_n(test1a, UDI(4), static_cast(10)) == test1a + 4); - assert(test1a[0].c == 11); - assert(test1a[1].c == 11); - assert(test1a[2].c == 11); - assert(test1a[3].c == 11); + assert(a[2] == 1); + assert(a[3] == 1); } class A { char a_; public: - A() {} - explicit A(char a) : a_(a) {} - operator unsigned char() const { return 'b'; } + TEST_CONSTEXPR A() : a_('a') {}; + TEST_CONSTEXPR explicit A(char a) : a_(a) {} + TEST_CONSTEXPR operator unsigned char() const { return 'b'; } - friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } + TEST_CONSTEXPR friend bool operator==(const A& x, const A& y) { return x.a_ == y.a_; } }; -void test5() { - A a[3]; - assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); - assert(a[0] == A('a')); - assert(a[1] == A('a')); - assert(a[2] == A('a')); -} +struct B { + TEST_CONSTEXPR B() : c(0) {} + TEST_CONSTEXPR B(char xc) : c(xc + 1) {} + char c; +}; struct Storage { union { @@ -124,11 +109,6 @@ struct Storage { }; }; -void test6() { - Storage foo[5]; - std::fill_n(&foo[0], UDI(5), Storage()); -} - // Make sure std::fill_n behaves properly with std::vector iterators with custom size types. // See https://github.com/llvm/llvm-project/pull/122410. 
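A self-contained sketch of the scenario the test below exercises (the allocator here is an assumption standing in for the suite's sized_allocator helper): a vector<bool> whose allocator reports a size_type narrower than std::size_t, so the bit-iterator arithmetic inside std::fill_n must not silently assume std::size_t.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <memory>
#include <vector>

template <class T>
struct SmallSizeAlloc {
  using value_type      = T;
  using size_type       = std::uint16_t; // deliberately narrower than size_t
  using difference_type = std::int16_t;
  SmallSizeAlloc() = default;
  template <class U>
  SmallSizeAlloc(const SmallSizeAlloc<U>&) {}
  T* allocate(size_type n) { return std::allocator<T>().allocate(n); }
  void deallocate(T* p, size_type n) { std::allocator<T>().deallocate(p, n); }
  friend bool operator==(const SmallSizeAlloc&, const SmallSizeAlloc&) { return true; }
  friend bool operator!=(const SmallSizeAlloc&, const SmallSizeAlloc&) { return false; }
};

int main() {
  std::vector<bool, SmallSizeAlloc<bool> > v(100, false);
  std::fill_n(v.begin(), 64, true); // size arithmetic must respect size_type
  assert(std::count(v.begin(), v.end(), true) == 64);
  return 0;
}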
TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { @@ -162,30 +142,44 @@ TEST_CONSTEXPR_CXX20 void test_bititer_with_custom_sized_types() { } } -int main(int, char**) { - test_char >(); - test_char >(); - test_char >(); - test_char >(); - test_char(); - - test_int >(); - test_int >(); - test_int >(); - test_int >(); - test_int(); +TEST_CONSTEXPR_CXX20 void test_struct_array() { + { + A a[3]; + assert(std::fill_n(&a[0], UDI(3), A('a')) == a + 3); + assert(a[0] == A('a')); + assert(a[1] == A('a')); + assert(a[2] == A('a')); + } + { + B b[4] = {}; + assert(std::fill_n(b, UDI(4), static_cast(10)) == b + 4); + assert(b[0].c == 11); + assert(b[1].c == 11); + assert(b[2].c == 11); + assert(b[3].c == 11); + } + { + Storage foo[5]; + std::fill_n(&foo[0], UDI(5), Storage()); + } +} + +TEST_CONSTEXPR_CXX20 bool test() { + types::for_each(types::forward_iterator_list(), Test()); + types::for_each(types::forward_iterator_list(), Test()); test_int_array(); - test_int_array_struct_source(); test_struct_array(); - - test5(); - test6(); - + test_int_array_struct_source(); test_bititer_with_custom_sized_types(); -#if TEST_STD_VER > 17 - static_assert(test_constexpr()); + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 20 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp index 556326fb0894c..e456fa8986aad 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp index 4abbd6f7a17c3..51232dfef1606 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.fill/pstl.fill_n.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(fill_n); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp index b1ad6873bc5e5..e28484ee4984b 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move.pass.cpp @@ -20,10 +20,12 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -45,15 +47,15 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; @@ -73,13 +75,13 @@ struct Test1 { const unsigned N = 
100; std::unique_ptr ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr ib[N]; - OutIter r = std::move(InIter(ia), InIter(ia+N), OutIter(ib)); - assert(base(r) == ib+N); + OutIter r = std::move(InIter(ia), InIter(ia + N), OutIter(ib)); + assert(base(r) == ib + N); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast(i)); + assert(*ib[i] == static_cast(i)); } }; @@ -92,11 +94,32 @@ struct Test1OutIters { } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin(), in.end(), out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector in(v); + std::vector out(N); + std::move(in.begin() + 4, in.end(), out.begin()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(v[i + 4] == out[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::cpp17_input_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::cpp17_input_iterator_list*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -105,20 +128,17 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move(a + 3, a + 10, a); int expected[] = {4, 5, 6, 7, 8, 9, 10, 8, 9, 10}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -127,7 +147,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move(std::begin(from), std::end(from), std::begin(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -135,6 +155,16 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp index 61dea47b51071..d8b7e68b155d6 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/move_backward.pass.cpp @@ -19,10 +19,12 @@ #include #include #include +#include #include "MoveOnly.h" #include "test_iterators.h" #include "test_macros.h" +#include "type_algorithms.h" class PaddedBase { public: @@ -44,24 +46,22 @@ struct Test { template TEST_CONSTEXPR_CXX20 void operator()() { const unsigned N = 1000; - int ia[N] = {}; + int ia[N] = {}; for (unsigned i = 0; i < N; ++i) - ia[i] = i; + ia[i] = i; int ib[N] = {0}; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = 
std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(ia[i] == ib[i]); + assert(ia[i] == ib[i]); } }; struct TestOutIters { template TEST_CONSTEXPR_CXX20 void operator()() { - types::for_each( - types::concatenate_t >(), - Test()); + types::for_each(types::concatenate_t >(), Test()); } }; @@ -72,29 +72,50 @@ struct Test1 { const unsigned N = 100; std::unique_ptr ia[N]; for (unsigned i = 0; i < N; ++i) - ia[i].reset(new int(i)); + ia[i].reset(new int(i)); std::unique_ptr ib[N]; - OutIter r = std::move_backward(InIter(ia), InIter(ia+N), OutIter(ib+N)); + OutIter r = std::move_backward(InIter(ia), InIter(ia + N), OutIter(ib + N)); assert(base(r) == ib); for (unsigned i = 0; i < N; ++i) - assert(*ib[i] == static_cast(i)); + assert(*ib[i] == static_cast(i)); } }; struct Test1OutIters { template TEST_CONSTEXPR_CXX23 void operator()() { - types::for_each(types::concatenate_t*> >(), - Test1()); + types::for_each( + types::concatenate_t*> >(), Test1()); } }; +TEST_CONSTEXPR_CXX20 bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move_backward with aligned bytes + std::vector in(v); + std::vector out(N); + std::move_backward(in.begin(), in.end(), out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector in(v); + std::vector out(N); + std::move_backward(in.begin(), in.end() - 4, out.end()); + for (std::size_t i = 0; i < N - 4; ++i) + assert(out[i + 4] == v[i]); + } + + return true; +} + TEST_CONSTEXPR_CXX20 bool test() { types::for_each(types::bidirectional_iterator_list(), TestOutIters()); if (TEST_STD_AT_LEAST_23_OR_RUNTIME_EVALUATED) types::for_each(types::bidirectional_iterator_list*>(), Test1OutIters()); - { // Make sure that padding bits aren't copied Derived src(1, 2, 3); Derived dst(4, 5, 6); @@ -104,20 +125,17 @@ TEST_CONSTEXPR_CXX20 bool test() { assert(dst.b_ == 2); assert(dst.c_ == 6); } - { // Make sure that overlapping ranges can be copied int a[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; std::move_backward(a, a + 7, a + 10); int expected[] = {1, 2, 3, 1, 2, 3, 4, 5, 6, 7}; assert(std::equal(a, a + 10, expected)); } - - // Make sure that the algorithm works with move-only types - { + { // Make sure that the algorithm works with move-only types // When non-trivial { MoveOnly from[3] = {1, 2, 3}; - MoveOnly to[3] = {}; + MoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == MoveOnly(1)); assert(to[1] == MoveOnly(2)); @@ -126,7 +144,7 @@ TEST_CONSTEXPR_CXX20 bool test() { // When trivial { TrivialMoveOnly from[3] = {1, 2, 3}; - TrivialMoveOnly to[3] = {}; + TrivialMoveOnly to[3] = {}; std::move_backward(std::begin(from), std::end(from), std::end(to)); assert(to[0] == TrivialMoveOnly(1)); assert(to[1] == TrivialMoveOnly(2)); @@ -134,11 +152,20 @@ TEST_CONSTEXPR_CXX20 bool test() { } } + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } + return true; } -int main(int, char**) -{ +int main(int, char**) { test(); #if TEST_STD_VER >= 20 static_assert(test()); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp index 
e4cc5649ce5d8..a82a068caf031 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/pstl.move.pass.cpp @@ -23,6 +23,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "type_algorithms.h" EXECUTION_POLICY_SFINAE_TEST(move); diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp index a0d1473360a14..1a89408865892 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template > concept HasMoveIt = requires(In in, Sent sent, Out out) { std::ranges::move(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array in) { { std::array out; std::same_as> decltype(auto) ret = - std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); + std::ranges::move(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -73,8 +74,7 @@ constexpr void test(std::array in) { { std::array out; auto range = std::ranges::subrange(In(in.data()), Sent(In(in.data() + in.size()))); - std::same_as> decltype(auto) ret = - std::ranges::move(range, Out(out.data())); + std::same_as> decltype(auto) ret = std::ranges::move(range, Out(out.data())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data() + out.size()); @@ -84,16 +84,16 @@ constexpr void test(std::array in) { template constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as> auto ret = - std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); + std::ranges::move(In(in.begin()), Sent(In(in.end())), Out(out.begin())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.end()); } { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as> auto ret = std::ranges::move(range, Out(out.begin())); @@ -165,22 +165,52 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for 
(std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move with aligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move(in, out.begin()); + assert(out == v); + } + { // Test move with unaligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move(std::views::counted(in.begin() + 4, N - 4), out.begin()); + assert(std::ranges::equal(v | std::views::drop(4), out | std::views::take(N - 4))); + } + + return true; +} +#endif + // cpp17_intput_iterator has a defaulted template argument template using Cpp17InIter = cpp17_input_iterator; @@ -267,13 +297,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array out; std::same_as> decltype(auto) ret = - std::ranges::move(std::array {1, 2, 3, 4}, out.data()); + std::ranges::move(std::array{1, 2, 3, 4}, out.data()); assert(ret.out == out.data() + 4); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array out; std::same_as::iterator, int*>> decltype(auto) ret = std::ranges::move(std::views::all(in), out.data()); @@ -284,8 +314,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; constexpr MoveOnce& operator=(MoveOnce&& other) { assert(!other.moved); @@ -294,16 +324,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); @@ -314,8 +344,8 @@ constexpr bool test() { { // check that the range is moved forwards struct OnlyForwardsMovable { OnlyForwardsMovable* next = nullptr; - bool canMove = false; - OnlyForwardsMovable() = default; + bool canMove = false; + OnlyForwardsMovable() = default; constexpr OnlyForwardsMovable& operator=(OnlyForwardsMovable&&) { assert(canMove); if (next != nullptr) @@ -324,12 +354,12 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array in{}; + std::array out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); + auto ret = std::ranges::move(in.begin(), in.end(), out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -337,12 +367,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array in {}; - std::array out {}; - out[0].next = &out[1]; - out[1].next = &out[2]; + std::array in{}; + std::array out{}; + out[0].next = &out[1]; + out[1].next = &out[2]; out[0].canMove = true; - auto ret = std::ranges::move(in, out.begin()); + auto ret = std::ranges::move(in, out.begin()); assert(ret.in == in.end()); assert(ret.out == out.end()); assert(out[0].canMove); @@ -358,19 +388,31 @@ constexpr bool test() { auto ret = std::ranges::move(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b 
== std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move(range, b.data()); + auto ret = std::ranges::move(range, b.data()); assert(ret.in == a + 4); assert(ret.out == b.data() + 4); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp index 47cf178636ad1..923b4c790dd1d 100644 --- a/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp +++ b/libcxx/test/std/algorithms/alg.modifying.operations/alg.move/ranges.move_backward.pass.cpp @@ -31,6 +31,7 @@ #include "almost_satisfies_types.h" #include "MoveOnly.h" #include "test_iterators.h" +#include "test_macros.h" template > concept HasMoveBackwardIt = requires(In in, Sent sent, Out out) { std::ranges::move_backward(in, sent, out); }; @@ -65,7 +66,7 @@ constexpr void test(std::array in) { { std::array out; std::same_as> decltype(auto) ret = - std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); + std::ranges::move_backward(In(in.data()), Sent(In(in.data() + in.size())), Out(out.data() + out.size())); assert(in == out); assert(base(ret.in) == in.data() + in.size()); assert(base(ret.out) == out.data()); @@ -92,16 +93,16 @@ constexpr void test_iterators() { template constexpr void test_containers() { { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); std::same_as> auto ret = - std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); + std::ranges::move_backward(In(in.begin()), Sent(In(in.end())), Out(out.end())); assert(std::ranges::equal(in, out)); assert(base(ret.in) == in.end()); assert(base(ret.out) == out.begin()); } { - InContainer in {1, 2, 3, 4}; + InContainer in{1, 2, 3, 4}; OutContainer out(4); auto range = std::ranges::subrange(In(in.begin()), Sent(In(in.end()))); std::same_as> auto ret = std::ranges::move_backward(range, Out(out.end())); @@ -159,25 +160,62 @@ constexpr void test_proxy_in_iterators() { } struct IteratorWithMoveIter { - using value_type = int; - using difference_type = int; + using value_type = int; + using difference_type = int; explicit IteratorWithMoveIter() = default; int* ptr; constexpr IteratorWithMoveIter(int* ptr_) : ptr(ptr_) {} constexpr int& operator*() const; // iterator with iter_move should not be dereferenced - constexpr IteratorWithMoveIter& operator++() { ++ptr; return *this; } - constexpr IteratorWithMoveIter operator++(int) { auto ret = *this; ++*this; return ret; } + constexpr IteratorWithMoveIter& operator++() { + ++ptr; + return *this; + } + constexpr IteratorWithMoveIter operator++(int) { + auto ret = *this; + ++*this; + return ret; + } - constexpr IteratorWithMoveIter& operator--() { --ptr; return *this; } - constexpr IteratorWithMoveIter operator--(int) { auto ret = *this; --*this; return ret; } + constexpr 
IteratorWithMoveIter& operator--() { + --ptr; + return *this; + } + constexpr IteratorWithMoveIter operator--(int) { + auto ret = *this; + --*this; + return ret; + } friend constexpr int iter_move(const IteratorWithMoveIter&) { return 42; } constexpr bool operator==(const IteratorWithMoveIter& other) const = default; }; +#if TEST_STD_VER >= 23 +constexpr bool test_vector_bool(std::size_t N) { + std::vector v(N, false); + for (std::size_t i = 0; i < N; i += 2) + v[i] = true; + + { // Test move_backward with aligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move_backward(in, out.end()); + assert(out == v); + } + { // Test move_backward with unaligned bytes + std::vector in{v}; + std::vector out(N); + std::ranges::move_backward(std::views::counted(in.begin(), N - 4), out.end()); + assert(std::ranges::equal(v | std::views::take(N - 4), out | std::views::drop(4))); + } + + return true; +} +#endif + constexpr bool test() { test_in_iterators(); test_in_iterators(); @@ -243,7 +281,8 @@ constexpr bool test() { MoveOnly b[3]; ProxyRange proxyA{a}; ProxyRange proxyB{b}; - std::ranges::move_backward(std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); + std::ranges::move_backward( + std::begin(proxyA), std::end(proxyA), std::ranges::next(proxyB.begin(), std::end(proxyB))); assert(b[0].get() == 1); assert(b[1].get() == 2); assert(b[2].get() == 3); @@ -253,13 +292,13 @@ constexpr bool test() { { // check that ranges::dangling is returned std::array out; std::same_as> auto ret = - std::ranges::move_backward(std::array {1, 2, 3, 4}, out.data() + out.size()); + std::ranges::move_backward(std::array{1, 2, 3, 4}, out.data() + out.size()); assert(ret.out == out.data()); assert((out == std::array{1, 2, 3, 4})); } { // check that an iterator is returned with a borrowing range - std::array in {1, 2, 3, 4}; + std::array in{1, 2, 3, 4}; std::array out; std::same_as::iterator, int*>> auto ret = std::ranges::move_backward(std::views::all(in), out.data() + out.size()); @@ -270,8 +309,8 @@ constexpr bool test() { { // check that every element is moved exactly once struct MoveOnce { - bool moved = false; - constexpr MoveOnce() = default; + bool moved = false; + constexpr MoveOnce() = default; constexpr MoveOnce(const MoveOnce& other) = delete; constexpr MoveOnce& operator=(const MoveOnce& other) { assert(!other.moved); @@ -280,16 +319,16 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(std::all_of(out.begin(), out.end(), [](const auto& e) { return e.moved; })); } { - std::array in {}; - std::array out {}; + std::array in{}; + std::array out{}; auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); @@ -300,8 +339,8 @@ constexpr bool test() { { // check that the range is moved backwards struct OnlyBackwardsMovable { OnlyBackwardsMovable* next = nullptr; - bool canMove = false; - OnlyBackwardsMovable() = default; + bool canMove = false; + OnlyBackwardsMovable() = default; constexpr OnlyBackwardsMovable& operator=(const OnlyBackwardsMovable&) { assert(canMove); if (next != nullptr) @@ -310,12 +349,12 @@ constexpr bool test() { } }; { - std::array in {}; - std::array out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array in{}; + std::array out{}; + out[1].next = &out[0]; + out[2].next = 
&out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in, out.end()); + auto ret = std::ranges::move_backward(in, out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -323,12 +362,12 @@ constexpr bool test() { assert(out[2].canMove); } { - std::array in {}; - std::array out {}; - out[1].next = &out[0]; - out[2].next = &out[1]; + std::array in{}; + std::array out{}; + out[1].next = &out[0]; + out[2].next = &out[1]; out[2].canMove = true; - auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); + auto ret = std::ranges::move_backward(in.begin(), in.end(), out.end()); assert(ret.in == in.end()); assert(ret.out == out.begin()); assert(out[0].canMove); @@ -344,19 +383,31 @@ constexpr bool test() { auto ret = std::ranges::move_backward(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4), b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } { int a[] = {1, 2, 3, 4}; std::array b; auto range = std::ranges::subrange(IteratorWithMoveIter(a), IteratorWithMoveIter(a + 4)); - auto ret = std::ranges::move_backward(range, b.data() + b.size()); + auto ret = std::ranges::move_backward(range, b.data() + b.size()); assert(ret.in == a + 4); assert(ret.out == b.data()); - assert((b == std::array {42, 42, 42, 42})); + assert((b == std::array{42, 42, 42, 42})); } } +#if TEST_STD_VER >= 23 + { // Test vector::iterator optimization + assert(test_vector_bool(8)); + assert(test_vector_bool(19)); + assert(test_vector_bool(32)); + assert(test_vector_bool(49)); + assert(test_vector_bool(64)); + assert(test_vector_bool(199)); + assert(test_vector_bool(256)); + } +#endif + return true; } diff --git a/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp new file mode 100644 index 0000000000000..9b22cbda9f345 --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.hash/hash.depr.verify.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: std-at-least-c++17 + +#include + +#include "test_macros.h" + +using A1 [[maybe_unused]] = std::hash>::argument_type; +using R1 [[maybe_unused]] = std::hash>::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash>'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash>'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif + +using A2 [[maybe_unused]] = std::hash::argument_type; +using R2 [[maybe_unused]] = std::hash::result_type; +#if TEST_STD_VER >= 20 +// expected-error@-3 {{no type named 'argument_type' in 'std::hash'}} +// expected-error@-3 {{no type named 'result_type' in 'std::hash'}} +#else +// expected-warning@-6 {{'argument_type' is deprecated}} +// expected-warning@-6 {{'result_type' is deprecated}} +#endif diff --git a/lld/test/ELF/aarch64-feature-gcs.s b/lld/test/ELF/aarch64-feature-gcs.s index 7a08673dbb7e6..b53a653dddaee 100644 --- a/lld/test/ELF/aarch64-feature-gcs.s +++ b/lld/test/ELF/aarch64-feature-gcs.s @@ -36,15 +36,15 @@ ## gcs-report should report any input files that don't have the gcs property. -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: not ld.lld func2-gcs.o func3.o --shared -o /dev/null -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=always 2>&1 | count 0 -# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -o /dev/null -z gcs-report=warning -z gcs=never 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: ld.lld func1-gcs.o func2.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-WARN %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=always 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: not ld.lld func2-gcs.o func3.o --shared -z gcs-report=error -z gcs=never 2>&1 | FileCheck --check-prefix=REPORT-ERROR %s +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=always 2>&1 | count 0 +# RUN: ld.lld func1-gcs.o func2-gcs.o func3-gcs.o -z gcs-report=warning -z gcs=never 2>&1 | count 0 # REPORT-WARN: 
warning: func2.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property # REPORT-ERROR: error: func3.o: -z gcs-report: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_GCS property diff --git a/lld/test/ELF/aarch64-relocs.s b/lld/test/ELF/aarch64-relocs.s index 198674c085b54..39cfcdd38661d 100644 --- a/lld/test/ELF/aarch64-relocs.s +++ b/lld/test/ELF/aarch64-relocs.s @@ -25,12 +25,10 @@ mystr: .asciz "blah" .size mystr, 4 -# PAGE(S + A) - PAGE(P) = PAGE(210136) - PAGE(0x210132) = 0 -# # CHECK: Disassembly of section .R_AARCH64_ADR_PREL_PG_HI21: # CHECK-EMPTY: # CHECK-NEXT: <.R_AARCH64_ADR_PREL_PG_HI21>: -# CHECK-NEXT: 210132: 90000001 adrp x1, 0x210000 +# CHECK-NEXT: adrp x1, 0x210000 .section .R_AARCH64_ADD_ABS_LO12_NC,"ax",@progbits add x0, x0, :lo12:.L.str @@ -64,39 +62,16 @@ foo: nop sub: nop - -# CHECK: Disassembly of section .SUB: -# CHECK-EMPTY: -# CHECK-NEXT: <.SUB>: -# CHECK-NEXT: 21014c: d503201f nop -# CHECK: : -# CHECK-NEXT: 210150: d503201f nop - .section .R_AARCH64_CALL26,"ax",@progbits call26: bl sub + b sub -# S = 0x21014c, A = 0x4, P = 0x210154 -# R = S + A - P = -0x4 = 0xfffffffc -# (R & 0x0ffffffc) >> 2 = 0x03ffffff -# 0x94000000 | 0x03ffffff = 0x97ffffff # CHECK: Disassembly of section .R_AARCH64_CALL26: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 210154: 97ffffff bl 0x210150 - -.section .R_AARCH64_JUMP26,"ax",@progbits -jump26: - b sub - -# S = 0x21014c, A = 0x4, P = 0x210158 -# R = S + A - P = -0x8 = 0xfffffff8 -# (R & 0x0ffffffc) >> 2 = 0x03fffffe -# 0x14000000 | 0x03fffffe = 0x17fffffe -# CHECK: Disassembly of section .R_AARCH64_JUMP26: -# CHECK-EMPTY: -# CHECK-NEXT: : -# CHECK-NEXT: 210158: 17fffffe b 0x210150 +# CHECK-NEXT: bl {{.*}} +# CHECK-NEXT: b {{.*}} .section .R_AARCH64_LDST32_ABS_LO12_NC,"ax",@progbits ldst32: @@ -179,14 +154,14 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_UABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f280018c movk x12, #12 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2a001ad movk x13, #13, lsl #16 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: f2c001ce movk x14, #14, lsl #32 -# CHECK-NEXT: d2e001ef mov x15, #4222124650659840 -# CHECK-NEXT: f2e001f0 movk x16, #15, lsl #48 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x12, #12 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x13, #13, lsl #16 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: movk x14, #14, lsl #32 +# CHECK-NEXT: mov x15, #4222124650659840 +# CHECK-NEXT: movk x16, #15, lsl #48 .section .R_AARCH64_MOVW_SABS,"ax",@progbits movz x1, #:abs_g0_s:zero+1 @@ -199,15 +174,15 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_SABS: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: d2800021 mov x1, #1 -# CHECK-NEXT: 92800001 mov x1, #-1 -# CHECK-NEXT: d2a00042 mov x2, #131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 92a00022 mov x2, #-65537 +# CHECK-NEXT: mov x2, #-65537 ## 12884901888 = 0x300000000 -# CHECK-NEXT: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 92c00043 mov x3, #-8589934593 +# CHECK-NEXT: mov x3, #-8589934593 .section .R_AARCH64_MOVW_PREL,"ax",@progbits movz x1, #:prel_g0:.+1 @@ -231,24 +206,24 @@ movz1: # CHECK: Disassembly of section .R_AARCH64_MOVW_PREL: # CHECK-EMPTY: # CHECK-NEXT: : -# CHECK-NEXT: 2101bc: d2800021 mov x1, #1 -# CHECK-NEXT: 2101c0: 92800001 mov x1, #-1 -# 
CHECK-NEXT: 2101c4: f2800021 movk x1, #1 -# CHECK-NEXT: 2101c8: f29fffe1 movk x1, #65535 -# CHECK-NEXT: 2101cc: d2a00042 mov x2, #131072 +# CHECK-NEXT: mov x1, #1 +# CHECK-NEXT: mov x1, #-1 +# CHECK-NEXT: movk x1, #1 +# CHECK-NEXT: movk x1, #65535 +# CHECK-NEXT: mov x2, #131072 ## -65537 = 0xfffffffffffeffff -# CHECK-NEXT: 2101d0: 92a00022 mov x2, #-65537 -# CHECK-NEXT: 2101d4: f2a00042 movk x2, #2, lsl #16 -# CHECK-NEXT: 2101d8: f2bfffc2 movk x2, #65534, lsl #16 +# CHECK-NEXT: mov x2, #-65537 +# CHECK-NEXT: movk x2, #2, lsl #16 +# CHECK-NEXT: movk x2, #65534, lsl #16 ## 12884901888 = 0x300000000 -# CHECK-NEXT: 2101dc: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #12884901888 ## -8589934593 = #0xfffffffdffffffff -# CHECK-NEXT: 2101e0: 92c00043 mov x3, #-8589934593 -# CHECK-NEXT: 2101e4: f2c00063 movk x3, #3, lsl #32 -# CHECK-NEXT: 2101e8: f2dfffa3 movk x3, #65533, lsl #32 -# CHECK-NEXT: 2101ec: d2c00063 mov x3, #12884901888 +# CHECK-NEXT: mov x3, #-8589934593 +# CHECK-NEXT: movk x3, #3, lsl #32 +# CHECK-NEXT: movk x3, #65533, lsl #32 +# CHECK-NEXT: mov x3, #12884901888 ## 1125899906842624 = 0x4000000000000 -# CHECK-NEXT: 2101f0: d2e00084 mov x4, #1125899906842624 -# CHECK-NEXT: 2101f4: d2ffff84 mov x4, #-1125899906842624 -# CHECK-NEXT: 2101f8: f2e00084 movk x4, #4, lsl #48 -# CHECK-NEXT: 2101fc: f2ffff84 movk x4, #65532, lsl #48 +# CHECK-NEXT: mov x4, #1125899906842624 +# CHECK-NEXT: mov x4, #-1125899906842624 +# CHECK-NEXT: movk x4, #4, lsl #48 +# CHECK-NEXT: movk x4, #65532, lsl #48 diff --git a/lld/test/ELF/allow-shlib-undefined-weak.s b/lld/test/ELF/allow-shlib-undefined-weak.s index 1037cbed0d859..141881fd73673 100644 --- a/lld/test/ELF/allow-shlib-undefined-weak.s +++ b/lld/test/ELF/allow-shlib-undefined-weak.s @@ -21,7 +21,7 @@ # RUN: ld.lld -shared wrap.o def.so -o wrap.so # RUN: llvm-mc -filetype=obj -triple=x86_64 start.s -o start.o -# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so -o /dev/null 2>&1 | count 0 +# RUN: ld.lld --no-allow-shlib-undefined start.o wrap.so ref.so 2>&1 | count 0 #--- start.s .globl _start diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index c69c1ea20ce3b..a088c2595d538 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -9,40 +9,40 @@ # RUN: cp a.so b.so # RUN: llvm-mc -filetype=obj -triple=x86_64 empty.s -o empty.o && ld.lld -shared empty.o -o empty.so -# RUN: ld.lld --allow-shlib-undefined main.o a.so -o /dev/null -# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --allow-shlib-undefined main.o a.so +# RUN: not ld.lld --no-allow-shlib-undefined main.o a.so 2>&1 | FileCheck %s ## Executable linking defaults to --no-allow-shlib-undefined. -# RUN: not ld.lld main.o a.so -o /dev/null 2>&1 | FileCheck %s -# RUN: ld.lld main.o a.so --noinhibit-exec -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN -# RUN: ld.lld main.o a.so --warn-unresolved-symbols -o /dev/null 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: not ld.lld main.o a.so 2>&1 | FileCheck %s +# RUN: ld.lld main.o a.so --noinhibit-exec 2>&1 | FileCheck %s --check-prefix=WARN +# RUN: ld.lld main.o a.so --warn-unresolved-symbols 2>&1 | FileCheck %s --check-prefix=WARN ## -shared linking defaults to --allow-shlib-undefined. -# RUN: ld.lld -shared main.o a.so -o /dev/null +# RUN: ld.lld -shared main.o a.so ## DSO with undefines should link with or without any of these options. 
-# RUN: ld.lld -shared --allow-shlib-undefined a.o -o /dev/null -# RUN: ld.lld -shared --no-allow-shlib-undefined a.o -o /dev/null +# RUN: ld.lld -shared --allow-shlib-undefined a.o +# RUN: ld.lld -shared --no-allow-shlib-undefined a.o ## Perform checking even if an unresolved symbol is first seen in a regular object file. -# RUN: not ld.lld --gc-sections main.o ref.o a.so -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o ref.o a.so 2>&1 | FileCheck %s ## Check that the error is reported for each shared library where the symbol ## is referenced. -# RUN: not ld.lld main.o a.so empty.so b.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK2 +# RUN: not ld.lld main.o a.so empty.so b.so 2>&1 | FileCheck %s --check-prefix=CHECK2 ## Test some cases when a relocatable object file provides a non-exported definition. -# RUN: not ld.lld main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings -o /dev/null +# RUN: not ld.lld main.o a.so def-hidden.o 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o -shared --no-allow-shlib-undefined 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: ld.lld main.o a.so def-hidden.o --allow-shlib-undefined --fatal-warnings ## Test a relocatable object file definition that is converted to STB_LOCAL. -# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED -# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver -o /dev/null 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o a.so def-hidden.o --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED +# RUN: not ld.lld main.o def-hidden.o a.so --version-script=local.ver 2>&1 | FileCheck %s --check-prefix=NONEXPORTED ## The section containing the definition is discarded, and we report an error. -# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s +# RUN: not ld.lld --gc-sections main.o a.so def-hidden.o 2>&1 | FileCheck %s ## The definition def.so is ignored. 
# RUN: ld.lld -shared def.o -o def.so -# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null +# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings # CHECK-NOT: error: # CHECK: error: undefined reference: x1{{$}} diff --git a/lld/test/ELF/arm-cmse-diagnostics.s b/lld/test/ELF/arm-cmse-diagnostics.s index d30f2431cc57a..4c8a4097e8250 100644 --- a/lld/test/ELF/arm-cmse-diagnostics.s +++ b/lld/test/ELF/arm-cmse-diagnostics.s @@ -7,11 +7,11 @@ // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base lib -o lib.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj --triple=thumbv8m.base app -I %S/Inputs -o app.o // RUN: llvm-objcopy --redefine-sym=entry7_duplicate=entry6_duplicate lib.o -// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB -// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB -// RUN: not ld.lld --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB -// RUN: not ld.lld --out-implib=out.lib app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB -// RUN: not ld.lld --out-implib=out.lib --in-implib=lib.o app.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IMPLIB +// RUN: not ld.lld --cmse-implib --in-implib=lib.o --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_MULT_INIMPLIB +// RUN: not ld.lld --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB +// RUN: not ld.lld --out-implib=out.lib app.o 2>&1 | FileCheck %s --check-prefixes=ERR_OUT_IMPLIB +// RUN: not ld.lld --out-implib=out.lib --in-implib=lib.o app.o 2>&1 | FileCheck %s --check-prefixes=ERR_IN_IMPLIB,ERR_OUT_IMPLIB // ERR_IMPLIB: error: CMSE symbol 'entry_not_external' in import library '{{.*}}' is not global // ERR_IMPLIB: error: CMSE symbol 'entry_not_absolute' in import library '{{.*}}' is not absolute @@ -91,7 +91,7 @@ /// Test diagnostics emitted during symbol attribute checks. // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base symattr -o symattr.o -// RUN: not ld.lld --cmse-implib symattr.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR +// RUN: not ld.lld --cmse-implib symattr.o 2>&1 | FileCheck %s --check-prefixes=ERR_SYMATTR // ERR_SYMATTR-NOT: __acle_se_valid_{{.*}} // ERR_SYMATTR: error: {{.*}}: cmse special symbol '__acle_se_invalid_1' is not a Thumb function definition @@ -110,9 +110,9 @@ /// Test diagnostics emitted when a symbol is removed from a later version of the import library. 
// RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv1 -o libv1.o // RUN: llvm-mc -arm-add-build-attributes -filetype=obj -I %S/Inputs --triple=thumbv8m.base libv2 -o libv2.o -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib -o /dev/null -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING -// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib -o /dev/null 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --out-implib=libv1.lib +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv2.o --in-implib=libv1.lib --out-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_MISSING +// RUN: ld.lld -Ttext=0x8000 --section-start .gnu.sgstubs=0x20000 --cmse-implib libv1.o --in-implib=libv2.lib 2>&1 | FileCheck %s --check-prefixes=WARN_NEWENTRY // WARN_MISSING: warning: entry function 'bar' from CMSE import library is not present in secure application // WARN_NEWENTRY: warning: new entry function 'bar' introduced but no output import library specified diff --git a/lld/test/ELF/avr-reloc-error.s b/lld/test/ELF/avr-reloc-error.s index f177e44f753fa..b36a24d764c5c 100644 --- a/lld/test/ELF/avr-reloc-error.s +++ b/lld/test/ELF/avr-reloc-error.s @@ -3,13 +3,13 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-7.s -o avr-pcrel-7.o -# RUN: not ld.lld avr-pcrel-7.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-7.o -Ttext=0x1000 --defsym=callee0=0x1040 --defsym=callee1=0x1084 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL7 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-pcrel-13.s -o avr-pcrel-13.o -# RUN: not ld.lld avr-pcrel-13.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ +# RUN: not ld.lld avr-pcrel-13.o -Ttext=0x1000 --defsym=callee0=0x2000 --defsym=callee1=0x2004 --defsym=callee2=0x100f 2>&1 | \ # RUN: FileCheck %s --check-prefix=PCREL13 # RUN: llvm-mc -filetype=obj -triple=avr -mcpu=atmega328 avr-abs.s -o avr-abs.o -# RUN: not ld.lld avr-abs.o -o /dev/null -Ttext=0x1000 --defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ +# RUN: not ld.lld avr-abs.o -Ttext=0x1000 --defsym=callee0=0x1009 --defsym=callee1=0x1010 2>&1 | \ # RUN: FileCheck %s --check-prefix=ABS #--- avr-pcrel-7.s diff --git a/lld/test/ELF/common-archive-lookup.s b/lld/test/ELF/common-archive-lookup.s index a30d0f18d01ad..9834d13ed7c24 100644 --- a/lld/test/ELF/common-archive-lookup.s +++ b/lld/test/ELF/common-archive-lookup.s @@ -69,7 +69,7 @@ # RUN: FileCheck --check-prefix=ASM %s < out.lto.s ## COMMON overrides weak. Don't extract 3.bc which provides a weak definition. 
-# RUN: ld.lld -o /dev/null main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s +# RUN: ld.lld main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s ## Old FORTRAN that mixes use of COMMON blocks and BLOCK DATA requires that we ## search through archives for non-tentative definitions (from the BLOCK DATA) diff --git a/lld/test/ELF/duplicated-synthetic-sym.s b/lld/test/ELF/duplicated-synthetic-sym.s index d08af3a1a52e5..9d47ec10f797f 100644 --- a/lld/test/ELF/duplicated-synthetic-sym.s +++ b/lld/test/ELF/duplicated-synthetic-sym.s @@ -1,14 +1,12 @@ // REQUIRES: x86 -// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: rm -rf %t.dir -// RUN: mkdir %t.dir -// RUN: cd %t.dir +// RUN: rm -rf %t && mkdir %t && cd %t +// RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o a.o // RUN: echo > file.bin -// RUN: not ld.lld %t.o --format=binary file.bin -o /dev/null 2>&1 | FileCheck %s -// RUN: not ld.lld %t.o --format binary file.bin -o /dev/null 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format=binary file.bin 2>&1 | FileCheck %s +// RUN: not ld.lld a.o --format binary file.bin 2>&1 | FileCheck %s -// CHECK: duplicate symbol: _binary_file_bin_start +// CHECK: error: duplicate symbol: _binary_file_bin_start // CHECK-NEXT: defined in {{.*}}.o // CHECK-NEXT: defined in file.bin diff --git a/lld/test/ELF/linkerscript/discard-section.s b/lld/test/ELF/linkerscript/discard-section.s index 0bbebac59bb34..d6dd8a5347e94 100644 --- a/lld/test/ELF/linkerscript/discard-section.s +++ b/lld/test/ELF/linkerscript/discard-section.s @@ -4,8 +4,8 @@ # RUN: rm -rf %t && split-file %s %t && cd %t # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs -o /dev/null 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: -# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -o /dev/null 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o -z undefs 2>&1 | FileCheck %s --check-prefix=LOCAL --implicit-check-not=error: +# RUN: not ld.lld --threads=1 -T a.lds a.o b.o 2>&1 | FileCheck %s --check-prefixes=LOCAL,NONLOCAL --implicit-check-not=error: # RUN: ld.lld -r -T a.lds a.o b.o -o a.ro 2>&1 | FileCheck %s --check-prefix=WARNING --implicit-check-not=warning: # RUN: llvm-readelf -r -s a.ro | FileCheck %s --check-prefix=RELOC diff --git a/lld/test/ELF/linkerscript/input-relative.s b/lld/test/ELF/linkerscript/input-relative.s index 771684c7c4f82..3f81c5f3ee9e3 100644 --- a/lld/test/ELF/linkerscript/input-relative.s +++ b/lld/test/ELF/linkerscript/input-relative.s @@ -31,13 +31,13 @@ ## The rules does not apply to an absolute path. # RUN: echo 'INPUT(/libb.a)' > dir/absolute.lds -# RUN: not ld.lld a.o dir/absolute.lds -o /dev/null +# RUN: not ld.lld a.o dir/absolute.lds ## If the parent directory of the current linker script does not contain the file, ## fall back to the current working directory. 
# RUN: cp libb.a libc.a # RUN: echo 'INPUT(libc.a)' > dir/fallback.lds -# RUN: ld.lld a.o dir/fallback.lds -o /dev/null +# RUN: ld.lld a.o dir/fallback.lds .globl _start _start: diff --git a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test index c82a93efc1aae..7a18015cfcab4 100644 --- a/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test +++ b/lld/test/ELF/linkerscript/locationcountererr-arm-exidx.test @@ -5,7 +5,7 @@ ## If we don't merge adjacent duplicate entries, __code_size will be negative and ## . += __code_size will trigger a "move location counter backward" error. ## LLD may report more errors further down, but there is only one "move location counter backward" error. -# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o -o /dev/null --no-merge-exidx-entries 2>&1 | \ +# RUN: not ld.lld -z norelro -z max-page-size=4096 -T a.t a.o --no-merge-exidx-entries 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR --implicit-check-not=error: # ERR: error: a.t:9: unable to move location counter (0x1000) backward to 0xf6c for section 'dummy1' diff --git a/lld/test/ELF/lto/archive-mixed.test b/lld/test/ELF/lto/archive-mixed.test index fbb84a1d8bb76..6f1db87c89ca1 100644 --- a/lld/test/ELF/lto/archive-mixed.test +++ b/lld/test/ELF/lto/archive-mixed.test @@ -19,22 +19,22 @@ ; RUN: llvm-ar rc other.bc.a a.bc ; RUN: llvm-ar rc other.o.a a.o -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.bc.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.bc.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.bc.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.bc.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.bc.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.bc.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ;; Not an LTO test case, but here for completeness. -; RUN: ld.lld -o /dev/null --trace ref.o a.o.b.o.a other.o.a | \ +; RUN: ld.lld --trace ref.o a.o.b.o.a other.o.a | \ ; RUN: FileCheck %s --implicit-check-not={{.}} ; CHECK: ref.o diff --git a/lld/test/ELF/lto/obj-path.ll b/lld/test/ELF/lto/obj-path.ll index c0bb4addf2466..bf291ff8a0458 100644 --- a/lld/test/ELF/lto/obj-path.ll +++ b/lld/test/ELF/lto/obj-path.ll @@ -54,14 +54,14 @@ ;; With --thinlto-index-only, --lto-obj-path= creates just one file. ; RUN: rm -f objpath.o objpath.o1 objpath.o2 -; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ; RUN: not ls objpath.o1 ; RUN: not ls objpath.o2 ;; Test --plugin-opt=obj-path=. 
; RUN: rm -f objpath.o -; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc ; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY ;; Ensure lld emits empty combined module if specific obj-path. diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll index 6b2c352b0a965..e32225c3ed3b8 100644 --- a/lld/test/ELF/lto/parallel.ll +++ b/lld/test/ELF/lto/parallel.ll @@ -5,7 +5,7 @@ ; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s ; RUN: llvm-nm out.lto.1.o | FileCheck --check-prefix=CHECK1 %s -; RUN: not ld.lld --lto-partitions=0 a.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s +; RUN: not ld.lld --lto-partitions=0 a.bc 2>&1 | FileCheck --check-prefix=INVALID %s ; INVALID: --lto-partitions: number of threads must be > 0 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/lld/test/ELF/non-abs-reloc.s b/lld/test/ELF/non-abs-reloc.s index 42b5f8fec1c43..e37a0ec12414b 100644 --- a/lld/test/ELF/non-abs-reloc.s +++ b/lld/test/ELF/non-abs-reloc.s @@ -15,13 +15,13 @@ // DISASM-NEXT: 6: call{{.}} 0x5 /// There is currently no error for -r. See also https://github.com/ClangBuiltLinux/linux/issues/1937 -// RUN: ld.lld -T lds -r a.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: +// RUN: ld.lld -T lds -r a.o 2>&1 | FileCheck %s --check-prefix=REL-R --implicit-check-not=warning: // REL-R: warning: {{.*}}:(.nonalloc1+0xa): has non-ABS relocation R_386_PC32 against symbol '' // RUN: llvm-mc -filetype=obj -triple=x86_64 asm -o b.o // RUN: ld.lld -T lds b.o -o b 2>&1 | FileCheck %s --check-prefix=CHECK2 --implicit-check-not=warning: // RUN: llvm-objdump -D --no-show-raw-insn b | FileCheck --check-prefix=DISASM %s -// RUN: ld.lld -T lds -r b.o -o /dev/null --fatal-warnings +// RUN: ld.lld -T lds -r b.o --fatal-warnings // CHECK2: warning: {{.*}}.o:(.nonalloc1+0x1): has non-ABS relocation R_X86_64_PC32 against symbol '_start' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0x6): has non-ABS relocation R_X86_64_PC32 against symbol 'ifunc' // CHECK2-NEXT: warning: {{.*}}.o:(.nonalloc1+0xa): has non-ABS relocation R_X86_64_PC32 against symbol '' diff --git a/lld/test/ELF/print-archive-stats.s b/lld/test/ELF/print-archive-stats.s index 2dd236f8e0a1f..5116685623ce2 100644 --- a/lld/test/ELF/print-archive-stats.s +++ b/lld/test/ELF/print-archive-stats.s @@ -10,7 +10,7 @@ # RUN: llvm-ar rc 1.a 1.o 2.o 3.o # RUN: llvm-ar rc lib2.a -# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=a.txt -o /dev/null +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=a.txt # RUN: FileCheck --input-file=a.txt -DT=%t %s --match-full-lines --strict-whitespace ## Fetches 0 member from %t/weak.a and 2 members from %t1.a @@ -20,10 +20,10 @@ # CHECK-NEXT:0 0 {{.*}}lib2.a ## - means stdout. -# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- -o /dev/null | diff a.txt - +# RUN: ld.lld a.o %t/weak.a 1.a -L. --print-archive-stats=- | diff a.txt - ## The second 1.a has 0 fetched member. -# RUN: ld.lld a.o %t/weak.a -L. -l:1.a -l:1.a --print-archive-stats=- -o /dev/null | \ +# RUN: ld.lld a.o %t/weak.a -L. 
-l:1.a -l:1.a --print-archive-stats=- | \ # RUN: FileCheck --check-prefix=CHECK2 %s # CHECK2: members extracted archive # CHECK2-NEXT: 1 0 {{.*}}weak.a @@ -31,7 +31,7 @@ # CHECK2-NEXT: 3 0 {{.*}}1.a # CHECK2-NEXT: 0 0 {{.*}}lib2.a -# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ -o /dev/null 2>&1 | FileCheck --check-prefix=ERR %s +# RUN: not ld.lld -shared a.o -L. --print-archive-stats=/ 2>&1 | FileCheck --check-prefix=ERR %s # ERR: error: --print-archive-stats=: cannot open /: {{.*}} #--- a.s diff --git a/lld/test/ELF/remap-inputs.test b/lld/test/ELF/remap-inputs.test index 0f9cafa987ac9..1be01c792a37b 100644 --- a/lld/test/ELF/remap-inputs.test +++ b/lld/test/ELF/remap-inputs.test @@ -17,26 +17,26 @@ # REPRO-NEXT: d.so ## --remap-inputs can also be specified multiple times. -# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so -o /dev/null +# RUN: ld.lld --remap-inputs 'aa.o=a.o' --remap-inputs='d[d].so=d.so' aa.o b.o c.a d.so ## A multiple-to-one pattern may easily cause issues. Users should be careful. -# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=3.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=DUPLICATE --implicit-check-not=error: # DUPLICATE: error: duplicate symbol: _start -# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err1.map aa.o bb.bc 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR1 --implicit-check-not=error: # ERR1: error: err1.map:2: parse error, not 'from-glob=to-file' # ERR1-NEXT: error: cannot open bb.bc: {{.*}} -# RUN: not ld.lld --remap-inputs-file=err2.map aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs-file=err2.map aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR2 --implicit-check-not=error: # ERR2: error: err2.map:1: invalid glob pattern, unmatched '[': aa.[o # ERR2-NEXT: error: cannot open aa.o: {{.*}} -# RUN: not ld.lld --remap-inputs=aa.o aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: -# RUN: not ld.lld --remap-inputs=aa.o= aa.o -o /dev/null 2>&1 | \ +# RUN: not ld.lld --remap-inputs=aa.o= aa.o 2>&1 | \ # RUN: FileCheck %s --check-prefix=ERR3 --implicit-check-not=error: # ERR3: error: --remap-inputs: parse error, not 'from-glob=to-file' # ERR3-NEXT: error: cannot open aa.o: {{.*}} diff --git a/lld/test/ELF/reproduce-deplibs.s b/lld/test/ELF/reproduce-deplibs.s index 06c25a2239834..48486d0e2bde7 100644 --- a/lld/test/ELF/reproduce-deplibs.s +++ b/lld/test/ELF/reproduce-deplibs.s @@ -8,7 +8,7 @@ # RUN: llvm-ar rc foo.a foo.o # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o bar.o -# RUN: ld.lld bar.o -o /dev/null --reproduce repro.tar +# RUN: ld.lld bar.o --reproduce repro.tar # RUN: tar tf repro.tar | FileCheck -DPATH='%:t.dir' %s # CHECK: [[PATH]]/foo.a diff --git a/lld/test/ELF/reproduce-lto.s b/lld/test/ELF/reproduce-lto.s index 36838f21388ef..b1a5bab122c56 100644 --- a/lld/test/ELF/reproduce-lto.s +++ b/lld/test/ELF/reproduce-lto.s @@ -5,10 +5,10 @@ # RUN: rm -rf %t.dir # RUN: mkdir -p %t.dir/build1 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.dir/build1/foo.o -# RUN: echo > %t.dir/build1/empty_profile.txt # RUN: cd %t.dir -# RUN: ld.lld build1/foo.o -o /dev/null --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o build1/foo.o +# RUN: echo > 
build1/empty_profile.txt +# RUN: ld.lld build1/foo.o --reproduce repro1.tar --lto-sample-profile=%t.dir/build1/empty_profile.txt # RUN: tar tvf repro1.tar | FileCheck %s --implicit-check-not={{.}} # CHECK-DAG: {{.*}} repro1/{{.*}}/empty_profile.txt diff --git a/lld/test/ELF/riscv-attributes.s b/lld/test/ELF/riscv-attributes.s index d003a298101cb..13b2c7a24d0b8 100644 --- a/lld/test/ELF/riscv-attributes.s +++ b/lld/test/ELF/riscv-attributes.s @@ -31,7 +31,7 @@ # RUN: llvm-readobj --arch-specific out3 | FileCheck %s --check-prefix=CHECK3 # RUN: llvm-mc -filetype=obj -triple=riscv64 invalid_arch1.s -o invalid_arch1.o -# RUN: not ld.lld invalid_arch1.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: +# RUN: not ld.lld invalid_arch1.o 2>&1 | FileCheck %s --check-prefix=INVALID_ARCH1 --implicit-check-not=error: # INVALID_ARCH1: error: invalid_arch1.o:(.riscv.attributes): rv64i2: extension lacks version in expected format ## A zero value attribute is not printed. @@ -41,20 +41,20 @@ ## Differing stack_align values lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 diff_stack_align.s -o diff_stack_align.o -# RUN: not ld.lld a.o b.o c.o diff_stack_align.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: +# RUN: not ld.lld a.o b.o c.o diff_stack_align.o 2>&1 | FileCheck %s --check-prefix=STACK_ALIGN --implicit-check-not=error: # STACK_ALIGN: error: diff_stack_align.o:(.riscv.attributes) has stack_align=32 but a.o:(.riscv.attributes) has stack_align=16 ## RISC-V tag merging for atomic_abi values A6C and A7 lead to an error. # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A6C.s -o atomic_abi_A6C.o # RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_A7.s -o atomic_abi_A7.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_A7.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_ERROR --implicit-check-not=error: # ATOMIC_ABI_ERROR: error: atomic abi mismatch for .riscv.attributes # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A6C.o:(.riscv.attributes): atomic_abi=1 # ATOMIC_ABI_ERROR-NEXT: >>> atomic_abi_A7.o:(.riscv.attributes): atomic_abi=3 ## RISC-V tag merging for atomic_abi values A6C and invalid lead to an error. 
# RUN: llvm-mc -filetype=obj -triple=riscv64 atomic_abi_invalid.s -o atomic_abi_invalid.o -# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o -o /dev/null 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: +# RUN: not ld.lld atomic_abi_A6C.o atomic_abi_invalid.o 2>&1 | FileCheck %s --check-prefix=ATOMIC_ABI_INVALID --implicit-check-not=error: # ATOMIC_ABI_INVALID: error: unknown atomic abi for .riscv.attributes # ATOMIC_ABI_INVALID-NEXT: >>> atomic_abi_invalid.o:(.riscv.attributes): atomic_abi=42 diff --git a/lld/test/ELF/unknown-section.test b/lld/test/ELF/unknown-section.test index f6ecca29a22ae..faf420e1fb5c4 100644 --- a/lld/test/ELF/unknown-section.test +++ b/lld/test/ELF/unknown-section.test @@ -1,6 +1,6 @@ # RUN: rm -rf %t && mkdir %t && cd %t # RUN: yaml2obj %s -o a.o -# RUN: not ld.lld a.o -o /dev/null 2>&1 | FileCheck %s --implicit-check-not=error: +# RUN: not ld.lld a.o 2>&1 | FileCheck %s --implicit-check-not=error: # CHECK: error: a.o:(relr): unknown section type 0x13 # CHECK-NEXT: error: a.o:(regular): unknown section type 0x15 diff --git a/lld/test/ELF/why-extract.s b/lld/test/ELF/why-extract.s index a41db8d9fd49a..3235bce5a7167 100644 --- a/lld/test/ELF/why-extract.s +++ b/lld/test/ELF/why-extract.s @@ -12,18 +12,18 @@ # RUN: cd %t ## Nothing is extracted from an archive. The file is created with just a header. -# RUN: ld.lld main.o a.o b.a -o /dev/null --why-extract=why1.txt +# RUN: ld.lld main.o a.o b.a --why-extract=why1.txt # RUN: FileCheck %s --input-file=why1.txt --check-prefix=CHECK1 --match-full-lines --strict-whitespace # CHECK1:reference extracted symbol # CHECK1-NOT:{{.}} ## Some archive members are extracted. -# RUN: ld.lld main.o a_b.a b.a -o /dev/null --why-extract=why2.txt +# RUN: ld.lld main.o a_b.a b.a --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace ## A relocation error does not suppress the output. -# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o -o /dev/null --why-extract=why2.txt +# RUN: rm -f why2.txt && not ld.lld main.o a_b.a b.a err.o --why-extract=why2.txt # RUN: FileCheck %s --input-file=why2.txt --check-prefix=CHECK2 --match-full-lines --strict-whitespace # CHECK2:reference extracted symbol @@ -31,12 +31,12 @@ # CHECK2-NEXT:a_b.a(a_b.o) b.a(b.o) b() ## An undefined symbol error does not suppress the output. -# RUN: not ld.lld main.o a_b.a -o /dev/null --why-extract=why3.txt +# RUN: not ld.lld main.o a_b.a --why-extract=why3.txt # RUN: FileCheck %s --input-file=why3.txt --check-prefix=CHECK3 --match-full-lines --strict-whitespace ## Check that backward references are supported. ## - means stdout. 
-# RUN: ld.lld b.a a_b.a main.o -o /dev/null --why-extract=- | FileCheck %s --check-prefix=CHECK4 +# RUN: ld.lld b.a a_b.a main.o --why-extract=- | FileCheck %s --check-prefix=CHECK4 # CHECK3:reference extracted symbol # CHECK3-NEXT:main.o a_b.a(a_b.o) a @@ -45,34 +45,34 @@ # CHECK4-NEXT:a_b.a(a_b.o) b.a(b.o) b() # CHECK4-NEXT:main.o a_b.a(a_b.o) a -# RUN: ld.lld main.o a_b.a b.a -o /dev/null --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED +# RUN: ld.lld main.o a_b.a b.a --no-demangle --why-extract=- | FileCheck %s --check-prefix=MANGLED # MANGLED: a_b.a(a_b.o) b.a(b.o) _Z1bv -# RUN: ld.lld main.o a.a b.a -o /dev/null -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED +# RUN: ld.lld main.o a.a b.a -u _Z1bv --why-extract=- | FileCheck %s --check-prefix=UNDEFINED ## We insert -u symbol before processing other files, so its name is . ## This is not ideal. # UNDEFINED: b.a(b.o) b() -# RUN: ld.lld main.o a.a b.a -o /dev/null --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB +# RUN: ld.lld main.o a.a b.a --undefined-glob '_Z1b*' --why-extract=- | FileCheck %s --check-prefix=UNDEFINED_GLOB # UNDEFINED_GLOB: --undefined-glob b.a(b.o) b() -# RUN: ld.lld main.o a.a b.a -o /dev/null -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY +# RUN: ld.lld main.o a.a b.a -e _Z1bv --why-extract=- | FileCheck %s --check-prefix=ENTRY # ENTRY: --entry b.a(b.o) b() -# RUN: ld.lld main.o b.a -o /dev/null -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT +# RUN: ld.lld main.o b.a -T a.lds --why-extract=- | FileCheck %s --check-prefix=SCRIPT # SCRIPT: b.a(b.o) b() -# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib -o /dev/null --why-extract=- | FileCheck %s --check-prefix=LAZY +# RUN: ld.lld main.o --start-lib a_b.o b.o --end-lib --why-extract=- | FileCheck %s --check-prefix=LAZY # LAZY: main.o a_b.o a # LAZY: a_b.o b.o b() -# RUN: not ld.lld -shared main.o -o /dev/null --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR +# RUN: not ld.lld -shared main.o --why-extract=/ 2>&1 | FileCheck %s --check-prefix=ERR # ERR: error: cannot open --why-extract= file /: {{.*}} diff --git a/lld/test/wasm/lto/Inputs/libcall-return-addr.ll b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll new file mode 100644 index 0000000000000..271bdae11e49d --- /dev/null +++ b/lld/test/wasm/lto/Inputs/libcall-return-addr.ll @@ -0,0 +1,6 @@ +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-emscripten" + +define ptr @emscripten_return_address() { + ret ptr null +} diff --git a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll b/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll deleted file mode 100644 index 1439d7f8b4cb4..0000000000000 --- a/lld/test/wasm/lto/Inputs/libcall-truncsfhf2.ll +++ /dev/null @@ -1,6 +0,0 @@ -target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" -target triple = "wasm32-unknown-unknown" - -define half @__truncsfhf2(float) { - ret half 0.0 -} diff --git a/lld/test/wasm/lto/libcall-return-addr.ll b/lld/test/wasm/lto/libcall-return-addr.ll new file mode 100644 index 0000000000000..74eba74f97018 --- /dev/null +++ b/lld/test/wasm/lto/libcall-return-addr.ll @@ -0,0 +1,18 @@ +; RUN: llvm-as %s -o %t.o +; RUN: llvm-as %p/Inputs/libcall-return-addr.ll -o %t.return-addr.o +; RUN: rm -f %t.a +; RUN: llvm-ar rcs %t.a %t.return-addr.o +; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s + +target datalayout = 
"e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-i128:128-f128:64-n32:64-S128-ni:1:10:20" +target triple = "wasm32-unknown-emscripten" + +@g_ptr = global ptr null + +define void @_start() { + %addr = call ptr @llvm.returnaddress(i32 1) + store ptr %addr, ptr @g_ptr + ret void +} + +; CHECK: wasm-ld: error: {{.*}}return-addr.o): attempt to add bitcode file after LTO (emscripten_return_address) diff --git a/lld/test/wasm/lto/libcall-truncsfhf2.ll b/lld/test/wasm/lto/libcall-truncsfhf2.ll deleted file mode 100644 index fd07bb53890f6..0000000000000 --- a/lld/test/wasm/lto/libcall-truncsfhf2.ll +++ /dev/null @@ -1,20 +0,0 @@ -; RUN: llvm-as %s -o %t.o -; RUN: llvm-as %p/Inputs/libcall-truncsfhf2.ll -o %t.truncsfhf2.o -; RUN: rm -f %t.a -; RUN: llvm-ar rcs %t.a %t.truncsfhf2.o -; RUN: not wasm-ld --export-all %t.o %t.a -o %t.wasm 2>&1 | FileCheck %s - -target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128-ni:1:10:20" -target triple = "wasm32-unknown-unknown" - -@g_float = global float 0.0 -@g_half = global half 0.0 - -define void @_start() { - %val1 = load float, ptr @g_float - %v0 = fptrunc float %val1 to half - store half %v0, ptr @g_half - ret void -} - -; CHECK: wasm-ld: error: {{.*}}truncsfhf2.o): attempt to add bitcode file after LTO (__truncsfhf2) diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h index d7751ca045bb2..7f08f3dd26106 100644 --- a/lldb/include/lldb/Core/Debugger.h +++ b/lldb/include/lldb/Core/Debugger.h @@ -131,17 +131,13 @@ class Debugger : public std::enable_shared_from_this, void SetAsyncExecution(bool async); - lldb::FileSP GetInputFileSP() { return m_input_file_sp; } - - lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; } - - lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; } - File &GetInputFile() { return *m_input_file_sp; } - File &GetOutputFile() { return m_output_stream_sp->GetFile(); } + lldb::FileSP GetInputFileSP() { return m_input_file_sp; } + + lldb::FileSP GetOutputFileSP() { return m_output_stream_sp->GetFileSP(); } - File &GetErrorFile() { return m_error_stream_sp->GetFile(); } + lldb::FileSP GetErrorFileSP() { return m_error_stream_sp->GetFileSP(); } repro::DataRecorder *GetInputRecorder(); @@ -649,6 +645,14 @@ class Debugger : public std::enable_shared_from_this, void PrintProgress(const ProgressEventData &data); + /// Except for Debugger and IOHandler, GetOutputStreamSP and GetErrorStreamSP + /// should not be used directly. Use GetAsyncOutputStream and + /// GetAsyncErrorStream instead. + /// @{ + lldb::StreamFileSP GetOutputStreamSP() { return m_output_stream_sp; } + lldb::StreamFileSP GetErrorStreamSP() { return m_error_stream_sp; } + /// @} + void PushIOHandler(const lldb::IOHandlerSP &reader_sp, bool cancel_top_handler = true); diff --git a/lldb/include/lldb/Core/StreamAsynchronousIO.h b/lldb/include/lldb/Core/StreamAsynchronousIO.h index b7adbc42096ce..7ae65757e2d73 100644 --- a/lldb/include/lldb/Core/StreamAsynchronousIO.h +++ b/lldb/include/lldb/Core/StreamAsynchronousIO.h @@ -18,9 +18,17 @@ namespace lldb_private { class Debugger; +/// A stream meant for asynchronously printing output. Output is buffered until +/// the stream is flushed or destroyed. Printing is handled by the currently +/// active IOHandler, or the debugger's output or error stream if there is none. 
class StreamAsynchronousIO : public Stream { public: - StreamAsynchronousIO(Debugger &debugger, bool for_stdout, bool colors); + enum ForSTDOUT : bool { + STDOUT = true, + STDERR = false, + }; + + StreamAsynchronousIO(Debugger &debugger, ForSTDOUT for_stdout); ~StreamAsynchronousIO() override; @@ -32,7 +40,7 @@ class StreamAsynchronousIO : public Stream { private: Debugger &m_debugger; std::string m_data; - bool m_for_stdout; + ForSTDOUT m_for_stdout; }; } // namespace lldb_private diff --git a/lldb/include/lldb/Symbol/LineTable.h b/lldb/include/lldb/Symbol/LineTable.h index 6d158ab518879..f66081b6ee110 100644 --- a/lldb/include/lldb/Symbol/LineTable.h +++ b/lldb/include/lldb/Symbol/LineTable.h @@ -102,6 +102,19 @@ class LineTable { void GetDescription(Stream *s, Target *target, lldb::DescriptionLevel level); + /// Helper functions for line table iteration. \c lower_bound returns the index + /// of the first line entry which ends after the given address (i.e., the + /// first entry which contains the given address or comes after it). + /// \c upper_bound returns the index of the first line entry which begins on + /// or after the given address (i.e., the entry which would come after the + /// entry containing the given address, if such an entry exists). Both functions + /// return GetSize() if there is no such entry. The functions are + /// most useful in combination: iterating from lower_bound(a) to + /// upper_bound(b) returns all line entries which intersect the half-open + /// range [a,b). + uint32_t lower_bound(const Address &so_addr) const; + uint32_t upper_bound(const Address &so_addr) const; + /// Find a line entry that contains the section offset address \a so_addr. /// /// \param[in] so_addr diff --git a/lldb/include/lldb/Target/ThreadPlanTracer.h b/lldb/include/lldb/Target/ThreadPlanTracer.h index a6fd2f031dc22..7c45e213f94f1 100644 --- a/lldb/include/lldb/Target/ThreadPlanTracer.h +++ b/lldb/include/lldb/Target/ThreadPlanTracer.h @@ -56,7 +56,7 @@ class ThreadPlanTracer { Process &m_process; lldb::tid_t m_tid; - Stream *GetLogStream(); + lldb::StreamSP GetLogStreamSP(); virtual void Log(); diff --git a/lldb/source/API/SBDebugger.cpp b/lldb/source/API/SBDebugger.cpp index bf19d2ff8333c..e646b09e05852 100644 --- a/lldb/source/API/SBDebugger.cpp +++ b/lldb/source/API/SBDebugger.cpp @@ -509,14 +509,14 @@ SBFile SBDebugger::GetInputFile() { FILE *SBDebugger::GetOutputFileHandle() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return m_opaque_sp->GetOutputStreamSP()->GetFile().GetStream(); + return m_opaque_sp->GetOutputFileSP()->GetStream(); return nullptr; } SBFile SBDebugger::GetOutputFile() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return SBFile(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); + return SBFile(m_opaque_sp->GetOutputFileSP()); return SBFile(); } @@ -524,7 +524,7 @@ FILE *SBDebugger::GetErrorFileHandle() { LLDB_INSTRUMENT_VA(this); if (m_opaque_sp) - return m_opaque_sp->GetErrorStreamSP()->GetFile().GetStream(); + return m_opaque_sp->GetErrorFileSP()->GetStream(); return nullptr; } @@ -532,7 +532,7 @@ SBFile SBDebugger::GetErrorFile() { LLDB_INSTRUMENT_VA(this); SBFile file; if (m_opaque_sp) - return SBFile(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); + return SBFile(m_opaque_sp->GetErrorFileSP()); return SBFile(); } @@ -573,8 +573,8 @@ void SBDebugger::HandleCommand(const char *command) { sb_interpreter.HandleCommand(command, result, false); - result.PutError(m_opaque_sp->GetErrorStreamSP()->GetFileSP()); - 
result.PutOutput(m_opaque_sp->GetOutputStreamSP()->GetFileSP()); + result.PutError(m_opaque_sp->GetErrorFileSP()); + result.PutOutput(m_opaque_sp->GetOutputFileSP()); if (!m_opaque_sp->GetAsyncExecution()) { SBProcess process(GetCommandInterpreter().GetProcess()); diff --git a/lldb/source/Commands/CommandObjectGUI.cpp b/lldb/source/Commands/CommandObjectGUI.cpp index b56e49b073b03..8630171bae9d1 100644 --- a/lldb/source/Commands/CommandObjectGUI.cpp +++ b/lldb/source/Commands/CommandObjectGUI.cpp @@ -28,10 +28,10 @@ void CommandObjectGUI::DoExecute(Args &args, CommandReturnObject &result) { #if LLDB_ENABLE_CURSES Debugger &debugger = GetDebugger(); - File &input = debugger.GetInputFile(); - File &output = debugger.GetOutputFile(); - if (input.GetStream() && output.GetStream() && input.GetIsRealTerminal() && - input.GetIsInteractive()) { + FileSP input_sp = debugger.GetInputFileSP(); + FileSP output_sp = debugger.GetOutputFileSP(); + if (input_sp->GetStream() && output_sp->GetStream() && + input_sp->GetIsRealTerminal() && input_sp->GetIsInteractive()) { IOHandlerSP io_handler_sp(new IOHandlerCursesGUI(debugger)); if (io_handler_sp) debugger.RunIOHandlerAsync(io_handler_sp); diff --git a/lldb/source/Commands/CommandObjectLog.cpp b/lldb/source/Commands/CommandObjectLog.cpp index 5fb2dfaab8de0..17efae189b05e 100644 --- a/lldb/source/Commands/CommandObjectLog.cpp +++ b/lldb/source/Commands/CommandObjectLog.cpp @@ -394,7 +394,8 @@ class CommandObjectLogDump : public CommandObjectParsed { (*file)->GetDescriptor(), /*shouldClose=*/true); } else { stream_up = std::make_unique( - GetDebugger().GetOutputFile().GetDescriptor(), /*shouldClose=*/false); + GetDebugger().GetOutputFileSP()->GetDescriptor(), + /*shouldClose=*/false); } const std::string channel = std::string(args[0].ref()); diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp index 18569e155b517..8b7814d434ee9 100644 --- a/lldb/source/Core/Debugger.cpp +++ b/lldb/source/Core/Debugger.cpp @@ -947,7 +947,7 @@ Debugger::Debugger(lldb::LogOutputCallback log_callback, void *baton) if (term && !strcmp(term, "dumb")) SetUseColor(false); // Turn off use-color if we don't write to a terminal with color support. - if (!GetOutputFile().GetIsTerminalWithColors()) + if (!GetOutputFileSP()->GetIsTerminalWithColors()) SetUseColor(false); if (Diagnostics::Enabled()) { @@ -1321,11 +1321,13 @@ bool Debugger::PopIOHandler(const IOHandlerSP &pop_reader_sp) { } StreamSP Debugger::GetAsyncOutputStream() { - return std::make_shared(*this, true, GetUseColor()); + return std::make_shared(*this, + StreamAsynchronousIO::STDOUT); } StreamSP Debugger::GetAsyncErrorStream() { - return std::make_shared(*this, false, GetUseColor()); + return std::make_shared(*this, + StreamAsynchronousIO::STDERR); } void Debugger::RequestInterrupt() { @@ -1678,7 +1680,7 @@ bool Debugger::EnableLog(llvm::StringRef channel, LLDB_LOG_OPTION_PREPEND_TIMESTAMP | LLDB_LOG_OPTION_PREPEND_THREAD_NAME; } else if (log_file.empty()) { log_handler_sp = - CreateLogHandler(log_handler_kind, GetOutputFile().GetDescriptor(), + CreateLogHandler(log_handler_kind, GetOutputFileSP()->GetDescriptor(), /*should_close=*/false, buffer_size); } else { auto pos = m_stream_handlers.find(log_file); @@ -2111,8 +2113,8 @@ void Debugger::HandleProgressEvent(const lldb::EventSP &event_sp) { // Determine whether the current output file is an interactive terminal with // color support. We assume that if we support ANSI escape codes we support // vt100 escape codes. 
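For out-of-tree code, the equivalent decision now goes through the FileSP accessors rather than File references. A minimal sketch under that assumption (PrintErrorHeader and its message are hypothetical; it reuses the same GetIsTerminalWithColors test that HandleProgressEvent applies):

#include "lldb/Core/Debugger.h"
#include "lldb/Utility/Stream.h"

// Hypothetical helper: only emit ANSI color codes when the debugger's output
// file is a color-capable terminal.
static void PrintErrorHeader(lldb_private::Debugger &debugger,
                             lldb_private::Stream &s) {
  lldb::FileSP out_sp = debugger.GetOutputFileSP();
  if (out_sp && out_sp->GetIsTerminalWithColors())
    s.PutCString("\x1b[31merror:\x1b[0m ");
  else
    s.PutCString("error: ");
}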
- File &file = GetOutputFile(); - if (!file.GetIsInteractive() || !file.GetIsTerminalWithColors()) + FileSP file_sp = GetOutputFileSP(); + if (!file_sp->GetIsInteractive() || !file_sp->GetIsTerminalWithColors()) return; StreamSP output = GetAsyncOutputStream(); diff --git a/lldb/source/Core/StreamAsynchronousIO.cpp b/lldb/source/Core/StreamAsynchronousIO.cpp index c2c64b61ab726..dbd56a69675b4 100644 --- a/lldb/source/Core/StreamAsynchronousIO.cpp +++ b/lldb/source/Core/StreamAsynchronousIO.cpp @@ -14,20 +14,20 @@ using namespace lldb; using namespace lldb_private; -StreamAsynchronousIO::StreamAsynchronousIO(Debugger &debugger, bool for_stdout, - bool colors) - : Stream(0, 4, eByteOrderBig, colors), m_debugger(debugger), m_data(), - m_for_stdout(for_stdout) {} +StreamAsynchronousIO::StreamAsynchronousIO( + Debugger &debugger, StreamAsynchronousIO::ForSTDOUT for_stdout) + : Stream(0, 4, eByteOrderBig, debugger.GetUseColor()), m_debugger(debugger), + m_data(), m_for_stdout(for_stdout) {} StreamAsynchronousIO::~StreamAsynchronousIO() { - // Flush when we destroy to make sure we display the data + // Flush when we destroy to make sure we display the data. Flush(); } void StreamAsynchronousIO::Flush() { if (!m_data.empty()) { m_debugger.PrintAsync(m_data.data(), m_data.size(), m_for_stdout); - m_data = std::string(); + m_data.clear(); } } diff --git a/lldb/source/Interpreter/CommandInterpreter.cpp b/lldb/source/Interpreter/CommandInterpreter.cpp index acdec84a1689b..5346d5a2d162a 100644 --- a/lldb/source/Interpreter/CommandInterpreter.cpp +++ b/lldb/source/Interpreter/CommandInterpreter.cpp @@ -2837,8 +2837,8 @@ void CommandInterpreter::HandleCommandsFromFile( } if (flags & eHandleCommandFlagPrintResult) { - debugger.GetOutputFile().Printf("Executing commands in '%s'.\n", - cmd_file_path.c_str()); + debugger.GetOutputFileSP()->Printf("Executing commands in '%s'.\n", + cmd_file_path.c_str()); } // Used for inheriting the right settings when "command source" might diff --git a/lldb/source/Interpreter/ScriptInterpreter.cpp b/lldb/source/Interpreter/ScriptInterpreter.cpp index 8d10e5de01225..a392d5777a021 100644 --- a/lldb/source/Interpreter/ScriptInterpreter.cpp +++ b/lldb/source/Interpreter/ScriptInterpreter.cpp @@ -245,8 +245,8 @@ ScriptInterpreterIORedirect::ScriptInterpreterIORedirect( if (outfile_handle) ::setbuf(outfile_handle, nullptr); - result->SetImmediateOutputFile(debugger.GetOutputStreamSP()->GetFileSP()); - result->SetImmediateErrorFile(debugger.GetErrorStreamSP()->GetFileSP()); + result->SetImmediateOutputFile(debugger.GetOutputFileSP()); + result->SetImmediateErrorFile(debugger.GetErrorFileSP()); } } diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp index cff44b588e26e..1d4cda6c046b7 100644 --- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp +++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp @@ -1193,7 +1193,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { m_kext_summary_header.version = data.GetU32(&offset); if (m_kext_summary_header.version > 128) { lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable version number %u\n", m_kext_summary_header.version); @@ -1208,7 +1208,7 @@ bool 
DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large entry_size, we're probably // getting bad memory. lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable entry_size %u\n", m_kext_summary_header.entry_size); @@ -1226,7 +1226,7 @@ bool DynamicLoaderDarwinKernel::ReadKextSummaryHeader() { // If we get an improbably large number of kexts, we're probably // getting bad memory. lldb::StreamSP s = - m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); s->Printf("WARNING: Unable to read kext summary header, got " "improbable number of kexts %u\n", m_kext_summary_header.entry_count); @@ -1330,7 +1330,8 @@ bool DynamicLoaderDarwinKernel::ParseKextSummaries( number_of_old_kexts_being_removed == 0) return true; - lldb::StreamSP s = m_process->GetTarget().GetDebugger().GetOutputStreamSP(); + lldb::StreamSP s = + m_process->GetTarget().GetDebugger().GetAsyncOutputStream(); if (load_kexts) { if (number_of_new_kexts_being_added > 0 && number_of_old_kexts_being_removed > 0) { diff --git a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp index 8c2700cf21de9..c2db3540a797b 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/UBSan/InstrumentationRuntimeUBSan.cpp @@ -116,8 +116,6 @@ StructuredData::ObjectSP InstrumentationRuntimeUBSan::RetrieveReportData( if (!frame_sp) return StructuredData::ObjectSP(); - StreamFileSP Stream = target.GetDebugger().GetOutputStreamSP(); - EvaluateExpressionOptions options; options.SetUnwindOnError(true); options.SetTryAllThreads(true); diff --git a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp index 74e0fa7d49f82..d61c59776eee6 100644 --- a/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp +++ b/lldb/source/Plugins/InstrumentationRuntime/Utility/ReportRetriever.cpp @@ -210,8 +210,8 @@ bool ReportRetriever::NotifyBreakpointHit(ProcessSP process_sp, InstrumentationRuntimeStopInfo::CreateStopReasonWithInstrumentationData( *thread_sp, description, report)); - if (StreamFileSP stream_sp = StreamFileSP( - process_sp->GetTarget().GetDebugger().GetOutputStreamSP())) + if (StreamSP stream_sp = + process_sp->GetTarget().GetDebugger().GetAsyncOutputStream()) stream_sp->Printf("AddressSanitizer report breakpoint hit. 
Use 'thread " "info -s' to get extended information about the " "report.\n"); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index 7e8eee9f5aa4f..6d028e324ee4e 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -45,8 +45,8 @@ class IOHandlerLuaInterpreter : public IOHandlerDelegate, m_script_interpreter(script_interpreter), m_active_io_handler(active_io_handler) { llvm::cantFail(m_script_interpreter.GetLua().ChangeIO( - debugger.GetOutputFile().GetStream(), - debugger.GetErrorFile().GetStream())); + debugger.GetOutputFileSP()->GetStream(), + debugger.GetErrorFileSP()->GetStream())); llvm::cantFail(m_script_interpreter.EnterSession(debugger.GetID())); } diff --git a/lldb/source/Symbol/LineTable.cpp b/lldb/source/Symbol/LineTable.cpp index 3d2afcdd11997..aae4ab59ff156 100644 --- a/lldb/source/Symbol/LineTable.cpp +++ b/lldb/source/Symbol/LineTable.cpp @@ -123,7 +123,7 @@ void LineTable::InsertSequence(LineSequence *sequence) { entry_collection::iterator end_pos = m_entries.end(); LineTable::Entry::LessThanBinaryPredicate less_than_bp(this); entry_collection::iterator pos = - upper_bound(begin_pos, end_pos, entry, less_than_bp); + std::upper_bound(begin_pos, end_pos, entry, less_than_bp); // We should never insert a sequence in the middle of another sequence if (pos != begin_pos) { @@ -185,6 +185,48 @@ bool LineTable::GetLineEntryAtIndex(uint32_t idx, LineEntry &line_entry) { return false; } +uint32_t LineTable::lower_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. upper_bound returns the first entry which definitely + // does not contain this address, which means the entry before it *might* + // contain it -- if it is not a termination entry. + auto pos = + llvm::upper_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + + if (pos != m_entries.begin() && !std::prev(pos)->is_terminal_entry) + --pos; + + return std::distance(m_entries.begin(), pos); +} + +uint32_t LineTable::upper_bound(const Address &so_addr) const { + if (so_addr.GetModule() != m_comp_unit->GetModule()) + return GetSize(); + + Entry search_entry; + search_entry.file_addr = so_addr.GetFileAddress(); + if (search_entry.file_addr == LLDB_INVALID_ADDRESS) + return GetSize(); + + // This is not a typo. lower_bound returns the first entry which starts on or + // after the given address, which is exactly what we want -- *except* if the + // entry is a termination entry (in that case, we want the one after it). 
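The effect of these two adjustments is easiest to see in the intended iteration idiom. Here is a sketch (not from this patch, assuming a populated LineTable) of walking every entry that intersects a half-open address range [a, b), as the header comment describes:

#include "lldb/Symbol/LineTable.h"

// Visit all line entries intersecting [a, b). Termination entries only mark
// the end of a sequence, so they are skipped.
static void ForEachEntryIntersecting(lldb_private::LineTable &table,
                                     const lldb_private::Address &a,
                                     const lldb_private::Address &b) {
  for (uint32_t idx = table.lower_bound(a), end = table.upper_bound(b);
       idx < end; ++idx) {
    lldb_private::LineEntry entry;
    if (table.GetLineEntryAtIndex(idx, entry) && !entry.is_terminal_entry) {
      // ... process entry ...
    }
  }
}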
+ auto pos = + llvm::lower_bound(m_entries, search_entry, Entry::EntryAddressLessThan); + if (pos != m_entries.end() && pos->file_addr == search_entry.file_addr && + pos->is_terminal_entry) + ++pos; + + return std::distance(m_entries.begin(), pos); +} + bool LineTable::FindLineEntryByAddress(const Address &so_addr, LineEntry &line_entry, uint32_t *index_ptr) { @@ -199,7 +241,7 @@ bool LineTable::FindLineEntryByAddress(const Address &so_addr, if (search_entry.file_addr != LLDB_INVALID_ADDRESS) { entry_collection::const_iterator begin_pos = m_entries.begin(); entry_collection::const_iterator end_pos = m_entries.end(); - entry_collection::const_iterator pos = lower_bound( + entry_collection::const_iterator pos = std::lower_bound( begin_pos, end_pos, search_entry, Entry::EntryAddressLessThan); if (pos != end_pos) { if (pos != begin_pos) { diff --git a/lldb/source/Target/ThreadPlanTracer.cpp b/lldb/source/Target/ThreadPlanTracer.cpp index a119bf8589279..ab63cc7f6c223 100644 --- a/lldb/source/Target/ThreadPlanTracer.cpp +++ b/lldb/source/Target/ThreadPlanTracer.cpp @@ -27,6 +27,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/State.h" +#include "lldb/lldb-forward.h" using namespace lldb; using namespace lldb_private; @@ -41,13 +42,13 @@ ThreadPlanTracer::ThreadPlanTracer(Thread &thread) : m_process(*thread.GetProcess().get()), m_tid(thread.GetID()), m_enabled(false), m_stream_sp(), m_thread(nullptr) {} -Stream *ThreadPlanTracer::GetLogStream() { +StreamSP ThreadPlanTracer::GetLogStreamSP() { if (m_stream_sp) - return m_stream_sp.get(); + return m_stream_sp; else { TargetSP target_sp(GetThread().CalculateTarget()); if (target_sp) - return target_sp->GetDebugger().GetOutputStreamSP().get(); + return target_sp->GetDebugger().GetAsyncOutputStream(); } return nullptr; } @@ -65,12 +66,11 @@ void ThreadPlanTracer::Log() { bool show_frame_index = false; bool show_fullpaths = false; - Stream *stream = GetLogStream(); - if (stream) { - GetThread().GetStackFrameAtIndex(0)->Dump(stream, show_frame_index, + if (StreamSP stream_sp = GetLogStreamSP()) { + GetThread().GetStackFrameAtIndex(0)->Dump(stream_sp.get(), show_frame_index, show_fullpaths); - stream->Printf("\n"); - stream->Flush(); + stream_sp->Printf("\n"); + stream_sp->Flush(); } } @@ -129,9 +129,9 @@ void ThreadPlanAssemblyTracer::TracingStarted() { void ThreadPlanAssemblyTracer::TracingEnded() { m_register_values.clear(); } void ThreadPlanAssemblyTracer::Log() { - Stream *stream = GetLogStream(); + StreamSP stream_sp = GetLogStreamSP(); - if (!stream) + if (!stream_sp) return; RegisterContext *reg_ctx = GetThread().GetRegisterContext().get(); @@ -142,9 +142,10 @@ void ThreadPlanAssemblyTracer::Log() { uint8_t buffer[16] = {0}; // Must be big enough for any single instruction addr_valid = m_process.GetTarget().ResolveLoadAddress(pc, pc_addr); - pc_addr.Dump(stream, &GetThread(), Address::DumpStyleResolvedDescription, + pc_addr.Dump(stream_sp.get(), &GetThread(), + Address::DumpStyleResolvedDescription, Address::DumpStyleModuleWithFileAddress); - stream->PutCString(" "); + stream_sp->PutCString(" "); Disassembler *disassembler = GetDisassembler(); if (disassembler) { @@ -175,7 +176,7 @@ void ThreadPlanAssemblyTracer::Log() { instruction_list.GetInstructionAtIndex(0).get(); const FormatEntity::Entry *disassemble_format = m_process.GetTarget().GetDebugger().GetDisassemblyFormat(); - instruction->Dump(stream, max_opcode_byte_size, show_address, + instruction->Dump(stream_sp.get(), max_opcode_byte_size, 
show_address, show_bytes, show_control_flow_kind, nullptr, nullptr, nullptr, disassemble_format, 0); } @@ -198,12 +199,12 @@ void ThreadPlanAssemblyTracer::Log() { if (abi->GetArgumentValues(GetThread(), value_list)) { for (int arg_index = 0; arg_index < num_args; ++arg_index) { - stream->Printf( + stream_sp->Printf( "\n\targ[%d]=%llx", arg_index, value_list.GetValueAtIndex(arg_index)->GetScalar().ULongLong()); if (arg_index + 1 < num_args) - stream->PutCString(", "); + stream_sp->PutCString(", "); } } } @@ -222,14 +223,14 @@ void ThreadPlanAssemblyTracer::Log() { if (m_register_values[reg_num].GetType() == RegisterValue::eTypeInvalid || reg_value != m_register_values[reg_num]) { if (reg_value.GetType() != RegisterValue::eTypeInvalid) { - stream->PutCString("\n\t"); - DumpRegisterValue(reg_value, *stream, *reg_info, true, false, + stream_sp->PutCString("\n\t"); + DumpRegisterValue(reg_value, *stream_sp, *reg_info, true, false, eFormatDefault); } } m_register_values[reg_num] = reg_value; } } - stream->EOL(); - stream->Flush(); + stream_sp->EOL(); + stream_sp->Flush(); } diff --git a/lldb/unittests/Symbol/CMakeLists.txt b/lldb/unittests/Symbol/CMakeLists.txt index e1d24357e33db..ab5cecd101833 100644 --- a/lldb/unittests/Symbol/CMakeLists.txt +++ b/lldb/unittests/Symbol/CMakeLists.txt @@ -1,5 +1,6 @@ add_lldb_unittest(SymbolTests JSONSymbolTest.cpp + LineTableTest.cpp LocateSymbolFileTest.cpp MangledTest.cpp PostfixExpressionTest.cpp diff --git a/lldb/unittests/Symbol/LineTableTest.cpp b/lldb/unittests/Symbol/LineTableTest.cpp new file mode 100644 index 0000000000000..2fa2913f67f9e --- /dev/null +++ b/lldb/unittests/Symbol/LineTableTest.cpp @@ -0,0 +1,285 @@ +//===-- LineTableTest.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "Plugins/ObjectFile/ELF/ObjectFileELF.h" +#include "TestingSupport/SubsystemRAII.h" +#include "TestingSupport/TestUtilities.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Symbol/CompileUnit.h" +#include "lldb/Symbol/SymbolFile.h" +#include "gtest/gtest.h" +#include + +using namespace lldb; +using namespace llvm; +using namespace lldb_private; + +namespace { + +// A fake symbol file class to allow us to create the line table "the right +// way". Pretty much all methods except for GetCompileUnitAtIndex and +// GetNumCompileUnits are stubbed out. +class FakeSymbolFile : public SymbolFile { +public: + /// LLVM RTTI support. + /// \{ + bool isA(const void *ClassID) const override { + return ClassID == &ID || SymbolFile::isA(ClassID); + } + static bool classof(const SymbolFile *obj) { return obj->isA(&ID); } + /// \} + + static void Initialize() { + PluginManager::RegisterPlugin("FakeSymbolFile", "", CreateInstance, + DebuggerInitialize); + } + static void Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } + + void InjectCompileUnit(std::unique_ptr cu_up) { + m_cu_sp = std::move(cu_up); + } + +private: + /// LLVM RTTI support. 
+ static char ID; + + static SymbolFile *CreateInstance(ObjectFileSP objfile_sp) { + return new FakeSymbolFile(std::move(objfile_sp)); + } + static void DebuggerInitialize(Debugger &) {} + + StringRef GetPluginName() override { return "FakeSymbolFile"; } + uint32_t GetAbilities() override { return UINT32_MAX; } + uint32_t CalculateAbilities() override { return UINT32_MAX; } + uint32_t GetNumCompileUnits() override { return 1; } + CompUnitSP GetCompileUnitAtIndex(uint32_t) override { return m_cu_sp; } + Symtab *GetSymtab() override { return nullptr; } + LanguageType ParseLanguage(CompileUnit &) override { return eLanguageTypeC; } + size_t ParseFunctions(CompileUnit &) override { return 0; } + bool ParseLineTable(CompileUnit &) override { return true; } + bool ParseDebugMacros(CompileUnit &) override { return true; } + bool ParseSupportFiles(CompileUnit &, SupportFileList &) override { + return true; + } + size_t ParseTypes(CompileUnit &) override { return 0; } + bool ParseImportedModules(const SymbolContext &, + std::vector &) override { + return false; + } + size_t ParseBlocksRecursive(Function &) override { return 0; } + size_t ParseVariablesForContext(const SymbolContext &) override { return 0; } + Type *ResolveTypeUID(user_id_t) override { return nullptr; } + std::optional + GetDynamicArrayInfoForUID(user_id_t, const ExecutionContext *) override { + return std::nullopt; + } + bool CompleteType(CompilerType &) override { return true; } + uint32_t ResolveSymbolContext(const Address &, SymbolContextItem, + SymbolContext &) override { + return 0; + } + void GetTypes(SymbolContextScope *, TypeClass, TypeList &) override {} + Expected GetTypeSystemForLanguage(LanguageType) override { + return createStringError(std::errc::not_supported, ""); + } + const ObjectFile *GetObjectFile() const override { + return m_objfile_sp.get(); + } + ObjectFile *GetObjectFile() override { return m_objfile_sp.get(); } + ObjectFile *GetMainObjectFile() override { return m_objfile_sp.get(); } + void SectionFileAddressesChanged() override {} + void Dump(Stream &) override {} + uint64_t GetDebugInfoSize(bool) override { return 0; } + bool GetDebugInfoIndexWasLoadedFromCache() const override { return false; } + void SetDebugInfoIndexWasLoadedFromCache() override {} + bool GetDebugInfoIndexWasSavedToCache() const override { return false; } + void SetDebugInfoIndexWasSavedToCache() override {} + bool GetDebugInfoHadFrameVariableErrors() const override { return false; } + void SetDebugInfoHadFrameVariableErrors() override {} + TypeSP MakeType(user_id_t, ConstString, std::optional, + SymbolContextScope *, user_id_t, Type::EncodingDataType, + const Declaration &, const CompilerType &, Type::ResolveState, + uint32_t) override { + return nullptr; + } + TypeSP CopyType(const TypeSP &) override { return nullptr; } + + FakeSymbolFile(ObjectFileSP objfile_sp) + : m_objfile_sp(std::move(objfile_sp)) {} + + ObjectFileSP m_objfile_sp; + CompUnitSP m_cu_sp; +}; + +struct FakeModuleFixture { + TestFile file; + ModuleSP module_sp; + SectionSP text_sp; + LineTable *line_table; +}; + +class LineTableTest : public testing::Test { + SubsystemRAII subsystems; +}; + +class LineSequenceBuilder { +public: + std::vector> Build() { + return std::move(m_sequences); + } + enum Terminal : bool { Terminal = true }; + void Entry(addr_t addr, bool terminal = false) { + LineTable::AppendLineEntryToSequence( + m_seq_up.get(), addr, /*line=*/1, /*column=*/0, + /*file_idx=*/0, + /*is_start_of_statement=*/false, /*is_start_of_basic_block=*/false, + 
/*is_prologue_end=*/false, /*is_epilogue_begin=*/false, terminal); + if (terminal) { + m_sequences.push_back(std::move(m_seq_up)); + m_seq_up = LineTable::CreateLineSequenceContainer(); + } + } + +private: + std::vector<std::unique_ptr<LineSequence>> m_sequences; + std::unique_ptr<LineSequence> m_seq_up = + LineTable::CreateLineSequenceContainer(); +}; + +} // namespace + +char FakeSymbolFile::ID; + +static llvm::Expected<FakeModuleFixture> +CreateFakeModule(std::vector<std::unique_ptr<LineSequence>> line_sequences) { + Expected<TestFile> file = TestFile::fromYaml(R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x0010 + Address: 0x0000 + Size: 0x1000 +)"); + if (!file) + return file.takeError(); + + auto module_sp = std::make_shared<Module>(file->moduleSpec()); + SectionSP text_sp = + module_sp->GetSectionList()->FindSectionByName(ConstString(".text")); + if (!text_sp) + return createStringError("No .text"); + + auto cu_up = std::make_unique<CompileUnit>(module_sp, /*user_data=*/nullptr, + /*support_file_sp=*/nullptr, + /*uid=*/0, eLanguageTypeC, + /*is_optimized=*/eLazyBoolNo); + LineTable *line_table = new LineTable(cu_up.get(), std::move(line_sequences)); + cu_up->SetLineTable(line_table); + cast<FakeSymbolFile>(module_sp->GetSymbolFile()) + ->InjectCompileUnit(std::move(cu_up)); + + return FakeModuleFixture{std::move(*file), std::move(module_sp), + std::move(text_sp), line_table}; +} + +TEST_F(LineTableTest, LowerAndUpperBound) { + LineSequenceBuilder builder; + builder.Entry(0); + builder.Entry(10); + builder.Entry(20, LineSequenceBuilder::Terminal); + builder.Entry(20); // Starts right after the previous sequence. + builder.Entry(30, LineSequenceBuilder::Terminal); + builder.Entry(40); // Gap after the previous sequence. + builder.Entry(50, LineSequenceBuilder::Terminal); + + llvm::Expected<FakeModuleFixture> fixture = CreateFakeModule(builder.Build()); + ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded()); + + LineTable *table = fixture->line_table; + + auto make_addr = [&](addr_t addr) { return Address(fixture->text_sp, addr); }; + + // Both functions return the same value for boundary values. This way the + // index range for e.g. [0,10) is [0,1). + EXPECT_EQ(table->lower_bound(make_addr(0)), 0u); + EXPECT_EQ(table->upper_bound(make_addr(0)), 0u); + EXPECT_EQ(table->lower_bound(make_addr(10)), 1u); + EXPECT_EQ(table->upper_bound(make_addr(10)), 1u); + EXPECT_EQ(table->lower_bound(make_addr(20)), 3u); + EXPECT_EQ(table->upper_bound(make_addr(20)), 3u); + + // In case there's no "real" entry at this address, they return the first real + // entry. + EXPECT_EQ(table->lower_bound(make_addr(30)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(30)), 5u); + + EXPECT_EQ(table->lower_bound(make_addr(40)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(40)), 5u); + + // For in-between values, their result differs by one. [9,19) maps to [0,2) + // because the first two entries contain a part of that range. + EXPECT_EQ(table->lower_bound(make_addr(9)), 0u); + EXPECT_EQ(table->upper_bound(make_addr(9)), 1u); + EXPECT_EQ(table->lower_bound(make_addr(19)), 1u); + EXPECT_EQ(table->upper_bound(make_addr(19)), 2u); + EXPECT_EQ(table->lower_bound(make_addr(29)), 3u); + EXPECT_EQ(table->upper_bound(make_addr(29)), 4u); + + // In a gap, they both return the first entry after the gap. + EXPECT_EQ(table->lower_bound(make_addr(39)), 5u); + EXPECT_EQ(table->upper_bound(make_addr(39)), 5u); + + // And if there's no such entry, they return the size of the list. 
+ EXPECT_EQ(table->lower_bound(make_addr(50)), table->GetSize()); + EXPECT_EQ(table->upper_bound(make_addr(50)), table->GetSize()); + EXPECT_EQ(table->lower_bound(make_addr(59)), table->GetSize()); + EXPECT_EQ(table->upper_bound(make_addr(59)), table->GetSize()); +} + +TEST_F(LineTableTest, FindLineEntryByAddress) { + LineSequenceBuilder builder; + builder.Entry(0); + builder.Entry(10); + builder.Entry(20, LineSequenceBuilder::Terminal); + builder.Entry(20); // Starts right after the previous sequence. + builder.Entry(30, LineSequenceBuilder::Terminal); + builder.Entry(40); // Gap after the previous sequence. + builder.Entry(50, LineSequenceBuilder::Terminal); + + llvm::Expected fixture = CreateFakeModule(builder.Build()); + ASSERT_THAT_EXPECTED(fixture, llvm::Succeeded()); + + LineTable *table = fixture->line_table; + + auto find = [&](addr_t addr) -> std::tuple { + LineEntry entry; + if (!table->FindLineEntryByAddress(Address(fixture->text_sp, addr), entry)) + return {LLDB_INVALID_ADDRESS, LLDB_INVALID_ADDRESS, false}; + return {entry.range.GetBaseAddress().GetFileAddress(), + entry.range.GetByteSize(), + static_cast(entry.is_terminal_entry)}; + }; + + EXPECT_THAT(find(0), testing::FieldsAre(0, 10, false)); + EXPECT_THAT(find(9), testing::FieldsAre(0, 10, false)); + EXPECT_THAT(find(10), testing::FieldsAre(10, 10, false)); + EXPECT_THAT(find(19), testing::FieldsAre(10, 10, false)); + EXPECT_THAT(find(20), testing::FieldsAre(20, 10, false)); + EXPECT_THAT(find(30), testing::FieldsAre(LLDB_INVALID_ADDRESS, + LLDB_INVALID_ADDRESS, false)); + EXPECT_THAT(find(40), testing::FieldsAre(40, 10, false)); + EXPECT_THAT(find(50), testing::FieldsAre(LLDB_INVALID_ADDRESS, + LLDB_INVALID_ADDRESS, false)); +} diff --git a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst index 90797499dec22..7603bcc95383b 100644 --- a/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst +++ b/llvm/docs/AMDGPU/AMDGPUAsmGFX940.rst @@ -6,7 +6,7 @@ ************************************************** ==================================================================================== -Syntax of gfx940 Instructions +Syntax of gfx942 Instructions ==================================================================================== .. contents:: @@ -15,7 +15,7 @@ Syntax of gfx940 Instructions Introduction ============ -This document describes the syntax of gfx940 instructions. +This document describes the syntax of gfx942 instructions. Notation ======== diff --git a/llvm/docs/AMDGPUOperandSyntax.rst b/llvm/docs/AMDGPUOperandSyntax.rst index ff6ec6cf71ff2..e8a76322fe76a 100644 --- a/llvm/docs/AMDGPUOperandSyntax.rst +++ b/llvm/docs/AMDGPUOperandSyntax.rst @@ -63,7 +63,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *vector* registers must be even-aligned (first register must be even). @@ -183,7 +183,7 @@ Note: *N* and *K* must satisfy the following conditions: * 0 <= *K* <= 255. * *K-N+1* must be in the range from 1 to 12 or equal to 16 or 32. -GFX90A and GFX940 have an additional alignment requirement: +GFX90A and GFX942 have an additional alignment requirement: pairs of *accumulator* registers must be even-aligned (first register must be even). 
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 5966d1617feee..d580be1eb8cfc 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. - **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX942-CDNA3]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa* Frontier Edition @@ -378,20 +378,6 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following - Ryzen 3 Pro 4350G - Ryzen 3 Pro 4350GE - ``gfx940`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - - ``gfx941`` ``amdgcn`` dGPU - sramecc - Architected *TBA* - - tgsplit flat - - xnack scratch .. TODO:: - - kernarg preload - Packed - work-item Add product - IDs names. - ``gfx942`` ``amdgcn`` dGPU - sramecc - Architected - AMD Instinct MI300X - tgsplit flat - AMD Instinct MI300A - xnack scratch @@ -583,10 +569,10 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``v_dot2_f32_f16`` - ``gfx9-4-generic`` ``amdgcn`` - ``gfx940`` - sramecc - Architected FP8 and BF8 instructions, - - ``gfx941`` - tgsplit flat scratch FP8 and BF8 conversion - - ``gfx942`` - xnack - Packed instructions, as well as - - ``gfx950`` - kernarg preload work-item instructions with XF32 format + ``gfx9-4-generic`` ``amdgcn`` - ``gfx942`` - sramecc - Architected FP8 and BF8 instructions, + - ``gfx950`` - tgsplit flat scratch FP8 and BF8 conversion + - xnack - Packed instructions, as well as + - kernarg preload work-item instructions with XF32 format IDs support are not available. ``gfx10-1-generic`` ``amdgcn`` - ``gfx1010`` - xnack - Absolute flat - The following instructions are @@ -2232,7 +2218,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1035`` 0x03d ``gfx1035`` ``EF_AMDGPU_MACH_AMDGCN_GFX1034`` 0x03e ``gfx1034`` ``EF_AMDGPU_MACH_AMDGCN_GFX90A`` 0x03f ``gfx90a`` - ``EF_AMDGPU_MACH_AMDGCN_GFX940`` 0x040 ``gfx940`` + *reserved* 0x040 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1100`` 0x041 ``gfx1100`` ``EF_AMDGPU_MACH_AMDGCN_GFX1013`` 0x042 ``gfx1013`` ``EF_AMDGPU_MACH_AMDGCN_GFX1150`` 0x043 ``gfx1150`` @@ -2243,7 +2229,7 @@ The AMDGPU backend uses the following ELF header: ``EF_AMDGPU_MACH_AMDGCN_GFX1200`` 0x048 ``gfx1200`` *reserved* 0x049 Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1151`` 0x04a ``gfx1151`` - ``EF_AMDGPU_MACH_AMDGCN_GFX941`` 0x04b ``gfx941`` + *reserved* 0x04b Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX942`` 0x04c ``gfx942`` *reserved* 0x04d Reserved. ``EF_AMDGPU_MACH_AMDGCN_GFX1201`` 0x04e ``gfx1201`` @@ -4985,7 +4971,7 @@ The fields used by CP for code objects before V3 also match those specified in bytes 383:352 4 bytes COMPUTE_PGM_RSRC3 GFX6-GFX9 Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 Compute Shader (CS) program settings used by CP to set up @@ -5070,7 +5056,7 @@ The fields used by CP for code objects before V3 also match those specified in 463:460 4 bits Reserved, must be 0. 
470:464 7 bits KERNARG_PRELOAD_SPEC_LENGTH GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - The number of dwords from the kernarg segment to preload into User SGPRs before kernel @@ -5078,7 +5064,7 @@ The fields used by CP for code objects before V3 also match those specified in :ref:`amdgpu-amdhsa-kernarg-preload`). 479:471 9 bits KERNARG_PRELOAD_SPEC_OFFSET GFX6-GFX9 - Reserved, must be 0. - GFX90A, GFX940 + GFX90A, GFX942 - An offset in dwords into the kernarg segment to begin preloading data into User @@ -5104,7 +5090,7 @@ The fields used by CP for code objects before V3 also match those specified in GFX6-GFX9 - vgprs_used 0..256 - max(0, ceil(vgprs_used / 4) - 1) - GFX90A, GFX940 + GFX90A, GFX942 - vgprs_used 0..512 - vgprs_used = align(arch_vgprs, 4) + acc_vgprs @@ -5570,7 +5556,7 @@ The fields used by CP for code objects before V3 also match those specified in .. - .. table:: compute_pgm_rsrc3 for GFX90A, GFX940 + .. table:: compute_pgm_rsrc3 for GFX90A, GFX942 :name: amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table ======= ======= =============================== =========================================================================== @@ -9981,15 +9967,15 @@ only accessed by a single thread, and is always write-before-read, there is never a need to invalidate these entries from the L1 cache. Hence all cache invalidates are done as ``*_vol`` to only invalidate the volatile cache lines. -The code sequences used to implement the memory model for GFX940, GFX941, GFX942 -are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table`. +The code sequences used to implement the memory model for GFX942 are defined in +table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx942-table`. - .. table:: AMDHSA Memory Model Code Sequences GFX940, GFX941, GFX942 - :name: amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx941-gfx942-table + .. table:: AMDHSA Memory Model Code Sequences GFX942 + :name: amdgpu-amdhsa-memory-model-code-sequences-gfx942-table ============ ============ ============== ========== ================================ LLVM Instr LLVM Memory LLVM Memory AMDGPU AMDGPU Machine Code - Ordering Sync Scope Address GFX940, GFX941, GFX942 + Ordering Sync Scope Address GFX942 Space ============ ============ ============== ========== ================================ **Non-Atomic** @@ -10024,18 +10010,12 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 load *none* *none* - local 1. ds_load store *none* *none* - global - !volatile & !nontemporal - generic - - private 1. GFX940, GFX941 + - private 1. GFX942 - constant buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store - !volatile & nontemporal - 1. GFX940, GFX941 - buffer/global/flat_store - nt=1 sc0=1 sc1=1 - GFX942 + 1. GFX942 buffer/global/flat_store nt=1 @@ -10707,11 +10687,8 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 **Release Atomic** ------------------------------------------------------------------------------------ - store atomic release - singlethread - global 1. GFX940, GFX941 + store atomic release - singlethread - global 1. GFX942 - wavefront - generic buffer/global/flat_store - sc0=1 sc1=1 - GFX942 - buffer/global/flat_store store atomic release - singlethread - local *If TgSplit execution mode, - wavefront local address space cannot @@ -10749,10 +10726,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 2. 
GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 2. GFX942 buffer/global/flat_store sc0=1 store atomic release - workgroup - local *If TgSplit execution mode, @@ -10813,10 +10787,7 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9 store that is being released. - 3. GFX940, GFX941 - buffer/global/flat_store - sc0=1 sc1=1 - GFX942 + 3. GFX942 buffer/global/flat_store sc1=1 store atomic release - system - global 1. buffer_wbl2 sc0=1 sc1=1 @@ -17574,11 +17545,7 @@ in this description. CDNA 2 :doc:`GFX9` :doc:`gfx90a` - CDNA 3 :doc:`GFX9` :doc:`gfx940` - - :doc:`gfx941` - - :doc:`gfx942` + CDNA 3 :doc:`GFX9` :doc:`gfx942` RDNA 1 :doc:`GFX10 RDNA1` :doc:`gfx1010` @@ -17616,7 +17583,7 @@ combinations of operands, refer to one of instruction set architecture manuals [AMD-GCN-GFX6]_, [AMD-GCN-GFX7]_, [AMD-GCN-GFX8]_, [AMD-GCN-GFX900-GFX904-VEGA]_, [AMD-GCN-GFX906-VEGA7NM]_, [AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, -[AMD-GCN-GFX940-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, +[AMD-GCN-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_, [AMD-GCN-GFX11-RDNA3]_ and [AMD-GCN-GFX11-RDNA3.5]_. Operands @@ -18129,7 +18096,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table` ``.amdhsa_user_sgpr_private_segment_buffer`` 0 GFX6-GFX10 Controls ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_dispatch_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_DISPATCH_PTR in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_queue_ptr`` 0 GFX6-GFX12 Controls ENABLE_SGPR_QUEUE_PTR in @@ -18140,7 +18107,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_flat_scratch_init`` 0 GFX6-GFX10 Controls ENABLE_SGPR_FLAT_SCRATCH_INIT in (except :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. - GFX940) + GFX942) ``.amdhsa_user_sgpr_private_segment_size`` 0 GFX6-GFX12 Controls ENABLE_SGPR_PRIVATE_SEGMENT_SIZE in :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_wavefront_size32`` Target GFX10-GFX12 Controls ENABLE_WAVEFRONT_SIZE32 in @@ -18151,8 +18118,8 @@ terminated by an ``.end_amdhsa_kernel`` directive. :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_system_sgpr_private_segment_wavefront_offset`` 0 GFX6-GFX10 Controls ENABLE_PRIVATE_SEGMENT in (except :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. - GFX940) - ``.amdhsa_enable_private_segment`` 0 GFX940, Controls ENABLE_PRIVATE_SEGMENT in + GFX942) + ``.amdhsa_enable_private_segment`` 0 GFX942, Controls ENABLE_PRIVATE_SEGMENT in GFX11-GFX12 :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_system_sgpr_workgroup_id_x`` 1 GFX6-GFX12 Controls ENABLE_SGPR_WORKGROUP_ID_X in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. @@ -18173,14 +18140,14 @@ terminated by an ``.end_amdhsa_kernel`` directive. Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_accum_offset`` Required GFX90A, Offset of a first AccVGPR in the unified register file. - GFX940 Used to calculate ACCUM_OFFSET in + GFX942 Used to calculate ACCUM_OFFSET in :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. ``.amdhsa_reserve_vcc`` 1 GFX6-GFX12 Whether the kernel may use the special VCC SGPR. 
Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_flat_scratch`` 1 GFX7-GFX10 Whether the kernel may use flat instructions to access (except scratch memory. Used to calculate - GFX940) GRANULATED_WAVEFRONT_SGPR_COUNT in + GFX942) GRANULATED_WAVEFRONT_SGPR_COUNT in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_reserve_xnack_mask`` Target GFX8-GFX10 Whether the kernel may trigger XNACK replay. Feature Used to calculate GRANULATED_WAVEFRONT_SGPR_COUNT in @@ -18211,7 +18178,7 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_fp16_overflow`` 0 GFX9-GFX12 Controls FP16_OVFL in :ref:`amdgpu-amdhsa-compute_pgm_rsrc1-gfx6-gfx12-table`. ``.amdhsa_tg_split`` Target GFX90A, Controls TG_SPLIT in - Feature GFX940, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. + Feature GFX942, :ref:`amdgpu-amdhsa-compute_pgm_rsrc3-gfx90a-table`. Specific GFX11-GFX12 (tgsplit) ``.amdhsa_workgroup_processor_mode`` Target GFX10-GFX12 Controls ENABLE_WGP_MODE in @@ -18242,9 +18209,9 @@ terminated by an ``.end_amdhsa_kernel`` directive. ``.amdhsa_exception_int_div_zero`` 0 GFX6-GFX12 Controls ENABLE_EXCEPTION_INT_DIVIDE_BY_ZERO in :ref:`amdgpu-amdhsa-compute_pgm_rsrc2-gfx6-gfx12-table`. ``.amdhsa_user_sgpr_kernarg_preload_length`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_LENGTH in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ``.amdhsa_user_sgpr_kernarg_preload_offset`` 0 GFX90A, Controls KERNARG_PRELOAD_SPEC_OFFSET in - GFX940 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. + GFX942 :ref:`amdgpu-amdhsa-kernel-descriptor-v3-table`. ======================================================== =================== ============ =================== .amdgpu_metadata @@ -18414,7 +18381,7 @@ Additional Documentation .. [AMD-GCN-GFX906-VEGA7NM] `AMD Vega 7nm Instruction Set Architecture `__ .. [AMD-GCN-GFX908-CDNA1] `AMD Instinct MI100 Instruction Set Architecture `__ .. [AMD-GCN-GFX90A-CDNA2] `AMD Instinct MI200 Instruction Set Architecture `__ -.. [AMD-GCN-GFX940-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ +.. [AMD-GCN-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA1] `AMD RDNA 1.0 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA2] `AMD RDNA 2 Instruction Set Architecture `__ .. [AMD-GCN-GFX11-RDNA3] `AMD RDNA 3 Instruction Set Architecture `__ diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 8550af456e961..675b458c41e7b 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1183,6 +1183,93 @@ operations. For more information, refer to the PTX ISA ``_. +'``llvm.nvvm.tcgen05.shift``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.shift.{cg1/cg2}``' intrinsics correspond to +the ``tcgen05.shift.{cg1/cg2}`` PTX instructions. The ``tcgen05.shift`` +is an asynchronous instruction which initiates the shifting of 32-byte +elements downwards across all the rows, except the last, by one row. +The address operand ``%tmem_addr`` specifies the base address of the +matrix in the Tensor Memory whose rows must be down shifted. + +For more information, refer to the PTX ISA +``_. 
+ +'``llvm.nvvm.tcgen05.cp``' +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. code-block:: llvm + + declare void @llvm.nvvm.tcgen05.cp.4x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + + declare void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + declare void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.{cg1,cg2}(ptr addrspace(6) %tmem_addr, i64 %sdesc) + +Overview: +""""""""" + +The '``@llvm.nvvm.tcgen05.cp.{shape}.{src_fmt}.{cg1/cg2}``' intrinsics +correspond to the ``tcgen05.cp.*`` family of PTX instructions. +The ``tcgen05.cp`` instruction initiates an asynchronous copy operation from +shared memory to the location specified by ``%tmem_addr`` in Tensor Memory. +The 64-bit register operand ``%sdesc`` is the matrix descriptor representing +the source matrix in shared memory that needs to be copied. + +The valid shapes for the copy operation are: +{128x256b, 4x256b, 128x128b, 64x128b_warpx2_02_13, 64x128b_warpx2_01_23, 32x128b_warpx4}. + +Shapes ``64x128b`` and ``32x128b`` require dedicated multicast qualifiers, +which are appended to the corresponding intrinsic names. + +Optionally, the data can be decompressed from the source format in the shared memory +to the destination format in Tensor Memory during the copy operation. Currently, +only ``.b8x16`` is supported as destination format. The valid source formats are +``.b6x16_p32`` and ``.b4x16_p64``. + +When the source format is ``.b6x16_p32``, a contiguous set of 16 elements of 6-bits +each followed by four bytes of padding (``_p32``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. 
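As a hedged sketch of such a decompressing copy (the function name and operand values are placeholders; the intrinsic signature is taken from the declarations listed above, and ``%sdesc`` is assumed to already describe the source matrix in shared memory):

.. code-block:: llvm

  ; Copy a 128x256b matrix described by the shared-memory descriptor %sdesc
  ; into tensor memory at %tmem, decompressing .b6x16_p32 data to .b8x16 (cg1).
  define void @copy_decompress_sketch(ptr addrspace(6) %tmem, i64 %sdesc) {
    call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %tmem, i64 %sdesc)
    ret void
  }

  declare void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6), i64)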
+ +When the source format is ``.b4x16_p64``, a contiguous set of 16 elements of 4-bits +each followed by eight bytes of padding (``_p64``) in shared memory is decompressed +into 16 elements of 8-bits (``.b8x16``) each in the Tensor Memory. + +For more information on the decompression schemes, refer to the PTX ISA +``_. + +For more information on the tcgen05.cp instruction, refer to the PTX ISA +``_. Other Intrinsics ---------------- diff --git a/llvm/include/llvm/Analysis/CaptureTracking.h b/llvm/include/llvm/Analysis/CaptureTracking.h index 573df8833bd46..06a00d9ae7899 100644 --- a/llvm/include/llvm/Analysis/CaptureTracking.h +++ b/llvm/include/llvm/Analysis/CaptureTracking.h @@ -14,13 +14,11 @@ #define LLVM_ANALYSIS_CAPTURETRACKING_H #include "llvm/ADT/DenseMap.h" -#include "llvm/Support/ModRef.h" namespace llvm { class Value; class Use; - class CaptureInfo; class DataLayout; class Instruction; class DominatorTree; @@ -79,47 +77,10 @@ namespace llvm { const DominatorTree &DT, unsigned MaxUsesToExplore = 0); - /// Capture information for a specific Use. - struct UseCaptureInfo { - /// Components captured by this use. - CaptureComponents UseCC; - /// Components captured by the return value of the user of this Use. - CaptureComponents ResultCC; - - UseCaptureInfo(CaptureComponents UseCC, - CaptureComponents ResultCC = CaptureComponents::None) - : UseCC(UseCC), ResultCC(ResultCC) {} - - static UseCaptureInfo passthrough() { - return UseCaptureInfo(CaptureComponents::None, CaptureComponents::All); - } - - bool isPassthrough() const { - return capturesNothing(UseCC) && capturesAnything(ResultCC); - } - - operator CaptureComponents() const { return UseCC | ResultCC; } - }; - /// This callback is used in conjunction with PointerMayBeCaptured. In /// addition to the interface here, you'll need to provide your own getters /// to see whether anything was captured. struct CaptureTracker { - /// Action returned from captures(). - enum Action { - /// Stop the traversal. - Stop, - /// Continue traversal, and also follow the return value of the user if - /// it has additional capture components (that is, if it has capture - /// components in Ret that are not part of Other). - Continue, - /// Continue traversal, but do not follow the return value of the user, - /// even if it has additional capture components. Should only be used if - /// captures() has already taken the potential return captures into - /// account. - ContinueIgnoringReturn, - }; - virtual ~CaptureTracker(); /// tooManyUses - The depth of traversal has breached a limit. There may be @@ -133,12 +94,10 @@ namespace llvm { /// U->getUser() is always an Instruction. virtual bool shouldExplore(const Use *U); - /// Use U directly captures CI.UseCC and additionally CI.ResultCC - /// through the return value of the user of U. - /// - /// Return one of Stop, Continue or ContinueIgnoringReturn to control - /// further traversal. - virtual Action captured(const Use *U, UseCaptureInfo CI) = 0; + /// captured - Information about the pointer was captured by the user of + /// use U. Return true to stop the traversal or false to continue looking + /// for more capturing instructions. + virtual bool captured(const Use *U) = 0; /// isDereferenceableOrNull - Overload to allow clients with additional /// knowledge about pointer dereferenceability to provide it and thereby @@ -146,18 +105,21 @@ namespace llvm { virtual bool isDereferenceableOrNull(Value *O, const DataLayout &DL); }; + /// Types of use capture kinds, see \p DetermineUseCaptureKind. 
+ enum class UseCaptureKind { + NO_CAPTURE, + MAY_CAPTURE, + PASSTHROUGH, + }; + /// Determine what kind of capture behaviour \p U may exhibit. /// - /// The returned UseCaptureInfo contains the components captured directly - /// by the use (UseCC) and the components captured through the return value - /// of the user (ResultCC). - /// - /// \p Base is the starting value of the capture analysis, which is - /// relevant for address_is_null captures. + /// A use can be no-capture, a use can potentially capture, or a use can be + /// passthrough such that the uses of the user or \p U should be inspected. /// The \p IsDereferenceableOrNull callback is used to rule out capturing for /// certain comparisons. - UseCaptureInfo - DetermineUseCaptureKind(const Use &U, const Value *Base, + UseCaptureKind + DetermineUseCaptureKind(const Use &U, llvm::function_ref IsDereferenceableOrNull); diff --git a/llvm/include/llvm/BinaryFormat/ELF.h b/llvm/include/llvm/BinaryFormat/ELF.h index 64f643749d6ac..37eab89e706db 100644 --- a/llvm/include/llvm/BinaryFormat/ELF.h +++ b/llvm/include/llvm/BinaryFormat/ELF.h @@ -814,7 +814,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1035 = 0x03d, EF_AMDGPU_MACH_AMDGCN_GFX1034 = 0x03e, EF_AMDGPU_MACH_AMDGCN_GFX90A = 0x03f, - EF_AMDGPU_MACH_AMDGCN_GFX940 = 0x040, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X40 = 0x040, EF_AMDGPU_MACH_AMDGCN_GFX1100 = 0x041, EF_AMDGPU_MACH_AMDGCN_GFX1013 = 0x042, EF_AMDGPU_MACH_AMDGCN_GFX1150 = 0x043, @@ -825,7 +825,7 @@ enum : unsigned { EF_AMDGPU_MACH_AMDGCN_GFX1200 = 0x048, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X49 = 0x049, EF_AMDGPU_MACH_AMDGCN_GFX1151 = 0x04a, - EF_AMDGPU_MACH_AMDGCN_GFX941 = 0x04b, + EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4B = 0x04b, EF_AMDGPU_MACH_AMDGCN_GFX942 = 0x04c, EF_AMDGPU_MACH_AMDGCN_RESERVED_0X4D = 0x04d, EF_AMDGPU_MACH_AMDGCN_GFX1201 = 0x04e, diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h index cf8e4a3d2513b..aa0dfbe666cde 100644 --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -873,7 +873,7 @@ class SelectionDAG { /// for integers, a type wider than) VT's element type. SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op) { // VerifySDNode (via InsertNode) checks BUILD_VECTOR later. - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && @@ -889,7 +889,7 @@ class SelectionDAG { // Return a splat ISD::SPLAT_VECTOR node, consisting of Op splatted to all // elements. SDValue getSplatVector(EVT VT, const SDLoc &DL, SDValue Op) { - if (Op.getOpcode() == ISD::UNDEF) { + if (Op.isUndef()) { assert((VT.getVectorElementType() == Op.getValueType() || (VT.isInteger() && VT.getVectorElementType().bitsLE(Op.getValueType()))) && diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 8e47e3c7b3a7c..90fe864d4ae71 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1692,11 +1692,6 @@ class CallBase : public Instruction { return capturesNothing(getCaptureInfo(OpNo)); } - /// Returns whether the call has an argument that has an attribute like - /// captures(ret: address, provenance), where the return capture components - /// are not a subset of the other capture components. - bool hasArgumentWithAdditionalReturnCaptureComponents() const; - /// Determine whether this argument is passed by value. 
bool isByValArgument(unsigned ArgNo) const { return paramHasAttr(ArgNo, Attribute::ByVal); diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9558f2b9b74e0..1e4f25c642493 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1074,7 +1074,7 @@ class AMDGPUImageDimIntrinsic.DmaskArgIndex>>]), @@ -1321,7 +1321,7 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // Note: volatile bit is **not** permitted here. @@ -1351,7 +1351,7 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1381,7 +1381,7 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1413,7 +1413,7 @@ class AMDGPUStructBufferLoad : DefaultAttrsIntri llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1431,7 +1431,7 @@ class AMDGPUStructAtomicBufferLoad : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1448,7 +1448,7 @@ class AMDGPUStructPtrBufferLoad : DefaultAttrsIn llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1467,7 +1467,7 @@ class AMDGPUStructPtrAtomicBufferLoad : Intrinsi llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ 
-1485,7 +1485,7 @@ class AMDGPURawBufferStore : DefaultAttrsIntrins llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1503,7 +1503,7 @@ class AMDGPURawPtrBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1523,7 +1523,7 @@ class AMDGPUStructBufferStore : DefaultAttrsIntr llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1542,7 +1542,7 @@ class AMDGPUStructPtrBufferStore : DefaultAttrsI llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1628,7 +1628,7 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic< // gfx908 intrinsic def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic; -// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx950, gfx12+. +// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx942, gfx950, gfx12+. 
def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic; class AMDGPUStructBufferAtomic : Intrinsic < @@ -1727,7 +1727,7 @@ def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz [IntrReadMem, @@ -1743,7 +1743,7 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1761,7 +1761,7 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1779,7 +1779,7 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1797,7 +1797,7 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1815,7 +1815,7 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1834,7 +1834,7 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1853,7 +1853,7 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = 
sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1872,7 +1872,7 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1891,7 +1891,7 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1914,7 +1914,7 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -1934,7 +1934,7 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty], // auxiliary/cachepolicy(imm): // bit 0 = glc, bit 1 = slc, bit 2 = dlc (gfx10/gfx11), // bit 3 = swz, bit 4 = scc (gfx90a) - // gfx940: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 + // gfx942: bit 0 = sc0, bit 1 = nt, bit 3 = swz, bit 4 = sc1 // gfx12+: bits [0-2] = th, bits [3-4] = scope, // bit 6 = swz // all: volatile op (bit 31, stripped at lowering) @@ -3007,7 +3007,7 @@ def int_amdgcn_fdot2_f32_bf16 : // f32 %r = llvm.amdgcn.fdot2c.f32.bf16(v2bf16 %a, v2bf16 %b, f32 %c, i1 %clamp) // %r = %a[0] * %b[0] + %a[1] * %b[1] + c // TODO: This actually is similar to llvm.amdgcn.fdot2 intrinsics which produces -// v_dot2c_f32_f16 on gfx940. Maybe we can consolidate these. +// v_dot2c_f32_f16 on gfx942. Maybe we can consolidate these. def int_amdgcn_fdot2c_f32_bf16 : ClangBuiltin<"__builtin_amdgcn_fdot2c_f32_bf16">, @@ -3250,7 +3250,7 @@ def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic; -// Note: in gfx940 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. +// Note: in gfx942 BLGP argument is replaced by NEG bitfield in the DGEMM MFMA. // Three bits corresponding to the neg modifier applied to the respective // source operand. def int_amdgcn_mfma_f64_16x16x4f64 : AMDGPUMfmaIntrinsic; @@ -3258,7 +3258,7 @@ def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic : diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 7ef270f3256a6..c32bf0318b5d6 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -55,6 +55,14 @@ def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr // MISC // +// Helper class that concatenates list elements with +// a given separator 'sep' and returns the result. +// Handles empty strings. 
+class StrJoin str_list> { + string ret = !foldl("", str_list, a, b, + !if(!eq(a, ""), b, !if(!eq(b, ""), a, !strconcat(a, sep, b)))); +} + // Helper class that represents a 'fragment' of an NVPTX *MMA instruction. // Geom: mnk. E.g. m8n32k16 // Frag: [a|b|c|d] ([x1|x2|x4] for ldmatrix) @@ -5140,6 +5148,11 @@ foreach cta_group = ["cg1", "cg2"] in { [llvm_shared_ptr_ty, llvm_i16_ty], // mbar_ptr, cta_mask [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>]>; + + def int_nvvm_tcgen05_shift_down_ # cta_group : Intrinsic<[], + [llvm_tmem_ptr_ty], // tmem_addr + [IntrConvergent, IntrArgMemOnly, + NoCapture>]>; } // Tcgen05 wait_ld/st intrinsics @@ -5154,4 +5167,23 @@ def int_nvvm_tcgen05_fence_before_thread_sync : Intrinsic<[], [], def int_nvvm_tcgen05_fence_after_thread_sync : Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects]>; +// Tcgen05 cp intrinsics +foreach cta_group = ["cg1", "cg2"] in { + foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + foreach shape = ["128x256b", "4x256b", "128x128b", + "64x128b_warpx2_02_13", + "64x128b_warpx2_01_23", + "32x128b_warpx4"] in { + defvar intr_suffix = StrJoin<"_", [shape, src_fmt, cta_group]>.ret; + defvar name_suffix = StrJoin<".", [shape, src_fmt, cta_group]>.ret; + + def int_nvvm_tcgen05_cp_ # intr_suffix : Intrinsic<[], + [llvm_tmem_ptr_ty, // tmem_addr + llvm_i64_ty], // smem descriptor + [IntrConvergent, IntrInaccessibleMemOrArgMemOnly, NoCapture>], + "llvm.nvvm.tcgen05.cp." # name_suffix>; + } + } +} + } // let TargetPrefix = "nvvm" diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index a7963543c4350..c6ac341d71a20 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -384,8 +384,8 @@ HANDLE_LIBCALL(FPEXT_F16_F128, "__extendhftf2") HANDLE_LIBCALL(FPEXT_F16_F80, "__extendhfxf2") HANDLE_LIBCALL(FPEXT_F32_F64, "__extendsfdf2") HANDLE_LIBCALL(FPEXT_F16_F64, "__extendhfdf2") -HANDLE_LIBCALL(FPEXT_F16_F32, "__gnu_h2f_ieee") -HANDLE_LIBCALL(FPROUND_F32_F16, "__gnu_f2h_ieee") +HANDLE_LIBCALL(FPEXT_F16_F32, "__extendhfsf2") +HANDLE_LIBCALL(FPROUND_F32_F16, "__truncsfhf2") HANDLE_LIBCALL(FPROUND_F64_F16, "__truncdfhf2") HANDLE_LIBCALL(FPROUND_F80_F16, "__truncxfhf2") HANDLE_LIBCALL(FPROUND_F128_F16, "__trunctfhf2") diff --git a/llvm/include/llvm/Support/DebugCounter.h b/llvm/include/llvm/Support/DebugCounter.h index e4345e5739e99..8e9dc29e4f48a 100644 --- a/llvm/include/llvm/Support/DebugCounter.h +++ b/llvm/include/llvm/Support/DebugCounter.h @@ -162,8 +162,9 @@ class DebugCounter { protected: unsigned addCounter(const std::string &Name, const std::string &Desc) { unsigned Result = RegisteredCounters.insert(Name); - Counters[Result] = {}; - Counters[Result].Desc = Desc; + auto &C = Counters[Result]; + C = {}; + C.Desc = Desc; return Result; } // Struct to store counter info. 
diff --git a/llvm/include/llvm/Support/ModRef.h b/llvm/include/llvm/Support/ModRef.h index 7f58f5236aedd..eb660844b0b3a 100644 --- a/llvm/include/llvm/Support/ModRef.h +++ b/llvm/include/llvm/Support/ModRef.h @@ -326,10 +326,6 @@ inline bool capturesFullProvenance(CaptureComponents CC) { return (CC & CaptureComponents::Provenance) == CaptureComponents::Provenance; } -inline bool capturesAll(CaptureComponents CC) { - return CC == CaptureComponents::All; -} - raw_ostream &operator<<(raw_ostream &OS, CaptureComponents CC); /// Represents which components of the pointer may be captured in which @@ -354,15 +350,6 @@ class CaptureInfo { /// Create CaptureInfo that may capture all components of the pointer. static CaptureInfo all() { return CaptureInfo(CaptureComponents::All); } - /// Create CaptureInfo that may only capture via the return value. - static CaptureInfo - retOnly(CaptureComponents RetComponents = CaptureComponents::All) { - return CaptureInfo(CaptureComponents::None, RetComponents); - } - - /// Whether the pointer is only captured via the return value. - bool isRetOnly() const { return capturesNothing(OtherComponents); } - /// Get components potentially captured by the return value. CaptureComponents getRetComponents() const { return RetComponents; } diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index 55e7b417428c4..f776b41f3d7ca 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -83,8 +83,6 @@ enum GPUKind : uint32_t { GK_GFX909 = 65, GK_GFX90A = 66, GK_GFX90C = 67, - GK_GFX940 = 68, - GK_GFX941 = 69, GK_GFX942 = 70, GK_GFX950 = 71, diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp index 58297accc7f1f..1a9136e464d25 100644 --- a/llvm/lib/Analysis/AliasAnalysis.cpp +++ b/llvm/lib/Analysis/AliasAnalysis.cpp @@ -835,15 +835,9 @@ bool llvm::isBaseOfObject(const Value *V) { } bool llvm::isEscapeSource(const Value *V) { - if (auto *CB = dyn_cast(V)) { - if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, true)) - return false; - - // The return value of a function with a captures(ret: address, provenance) - // attribute is not necessarily an escape source. The return value may - // alias with a non-escaping object. - return !CB->hasArgumentWithAdditionalReturnCaptureComponents(); - } + if (auto *CB = dyn_cast(V)) + return !isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(CB, + true); // The load case works because isNonEscapingLocalObject considers all // stores to be escapes (it passes true for the StoreCaptures argument @@ -859,6 +853,12 @@ bool llvm::isEscapeSource(const Value *V) { if (isa(V)) return true; + // Capture tracking considers insertions into aggregates and vectors as + // captures. As such, extractions from aggregates and vectors are escape + // sources. + if (isa(V)) + return true; + // Same for inttoptr constant expressions. 
if (auto *CE = dyn_cast(V)) if (CE->getOpcode() == Instruction::IntToPtr) diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp index 21530693c5f18..c27bfa6f3cc2c 100644 --- a/llvm/lib/Analysis/AssumeBundleQueries.cpp +++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp @@ -85,13 +85,14 @@ void llvm::fillMapFromAssume(AssumeInst &Assume, RetainedKnowledgeMap &Result) { if (!CI) continue; uint64_t Val = CI->getZExtValue(); - auto Lookup = Result.find(Key); - if (Lookup == Result.end() || !Lookup->second.count(&Assume)) { - Result[Key][&Assume] = {Val, Val}; + auto [It, Inserted] = Result[Key].try_emplace(&Assume); + if (Inserted) { + It->second = {Val, Val}; continue; } - Lookup->second[&Assume].Min = std::min(Val, Lookup->second[&Assume].Min); - Lookup->second[&Assume].Max = std::max(Val, Lookup->second[&Assume].Max); + auto &MinMax = It->second; + MinMax.Min = std::min(Val, MinMax.Min); + MinMax.Max = std::max(Val, MinMax.Max); } } diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp index 5120b910e7896..49baf2eb84bb3 100644 --- a/llvm/lib/Analysis/CaptureTracking.cpp +++ b/llvm/lib/Analysis/CaptureTracking.cpp @@ -81,15 +81,14 @@ struct SimpleCaptureTracker : public CaptureTracker { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { if (isa(U->getUser()) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; LLVM_DEBUG(dbgs() << "Captured by: " << *U->getUser() << "\n"); Captured = true; - return Stop; + return true; } bool ReturnCaptures; @@ -123,21 +122,19 @@ struct CapturesBefore : public CaptureTracker { return !isPotentiallyReachable(I, BeforeHere, nullptr, DT, LI); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; // Check isSafeToPrune() here rather than in shouldExplore() to avoid // an expensive reachability query for every instruction we look at. // Instead we only do one for actual capturing candidates. if (isSafeToPrune(I)) - // If the use is not reachable, the instruction result isn't either. - return ContinueIgnoringReturn; + return false; Captured = true; - return Stop; + return true; } const Instruction *BeforeHere; @@ -169,11 +166,10 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = &*F.getEntryBlock().begin(); } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { Instruction *I = cast(U->getUser()); if (isa(I) && !ReturnCaptures) - return ContinueIgnoringReturn; + return false; if (!EarliestCapture) EarliestCapture = I; @@ -181,10 +177,9 @@ struct EarliestCaptures : public CaptureTracker { EarliestCapture = DT.findNearestCommonDominator(EarliestCapture, I); Captured = true; - // Continue analysis, as we need to see all potential captures. However, - // we do not need to follow the instruction result, as this use will - // dominate any captures made through the instruction result.. - return ContinueIgnoringReturn; + // Return false to continue analysis; we need to see all potential + // captures. 
+ return false; } Instruction *EarliestCapture = nullptr; @@ -279,26 +274,25 @@ Instruction *llvm::FindEarliestCapture(const Value *V, Function &F, return CB.EarliestCapture; } -UseCaptureInfo llvm::DetermineUseCaptureKind( - const Use &U, const Value *Base, +UseCaptureKind llvm::DetermineUseCaptureKind( + const Use &U, function_ref IsDereferenceableOrNull) { Instruction *I = dyn_cast(U.getUser()); // TODO: Investigate non-instruction uses. if (!I) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; switch (I->getOpcode()) { case Instruction::Call: case Instruction::Invoke: { - // TODO(captures): Make this more precise. auto *Call = cast(I); // Not captured if the callee is readonly, doesn't return a copy through // its return value and doesn't unwind (a readonly function can leak bits // by throwing an exception or not depending on the input value). if (Call->onlyReadsMemory() && Call->doesNotThrow() && Call->getType()->isVoidTy()) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // The pointer is not captured if returned pointer is not captured. // NOTE: CaptureTracking users should not assume that only functions @@ -306,13 +300,13 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // getUnderlyingObject in ValueTracking or DecomposeGEPExpression // in BasicAA also need to know about this property. if (isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(Call, true)) - return UseCaptureInfo::passthrough(); + return UseCaptureKind::PASSTHROUGH; // Volatile operations effectively capture the memory location that they // load and store to. if (auto *MI = dyn_cast(Call)) if (MI->isVolatile()) - return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; // Calling a function pointer does not in itself cause the pointer to // be captured. This is a subtle point considering that (for example) @@ -321,27 +315,30 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // captured, even though the loaded value might be the pointer itself // (think of self-referential objects). if (Call->isCallee(&U)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; // Not captured if only passed via 'nocapture' arguments. assert(Call->isDataOperand(&U) && "Non-callee must be data operand"); - CaptureInfo CI = Call->getCaptureInfo(Call->getDataOperandNo(&U)); - return UseCaptureInfo(CI.getOtherComponents(), CI.getRetComponents()); + if (!Call->doesNotCapture(Call->getDataOperandNo(&U))) { + // The parameter is not marked 'nocapture' - captured. + return UseCaptureKind::MAY_CAPTURE; + } + return UseCaptureKind::NO_CAPTURE; } case Instruction::Load: // Volatile loads make the address observable. if (cast(I)->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; case Instruction::VAArg: // "va-arg" from a pointer does not cause it to be captured. - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; case Instruction::Store: // Stored the pointer - conservatively assume it may be captured. // Volatile stores make the address observable. if (U.getOperandNo() == 0 || cast(I)->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; case Instruction::AtomicRMW: { // atomicrmw conceptually includes both a load and store from // the same location. 
@@ -350,8 +347,8 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ARMWI = cast(I); if (U.getOperandNo() == 1 || ARMWI->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; } case Instruction::AtomicCmpXchg: { // cmpxchg conceptually includes both a load and store from @@ -361,35 +358,31 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // Volatile stores make the address observable. auto *ACXI = cast(I); if (U.getOperandNo() == 1 || U.getOperandNo() == 2 || ACXI->isVolatile()) - return CaptureComponents::All; - return CaptureComponents::None; + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::NO_CAPTURE; } case Instruction::GetElementPtr: // AA does not support pointers of vectors, so GEP vector splats need to // be considered as captures. if (I->getType()->isVectorTy()) - return CaptureComponents::All; - return UseCaptureInfo::passthrough(); + return UseCaptureKind::MAY_CAPTURE; + return UseCaptureKind::PASSTHROUGH; case Instruction::BitCast: case Instruction::PHI: case Instruction::Select: case Instruction::AddrSpaceCast: // The original value is not captured via this if the new value isn't. - return UseCaptureInfo::passthrough(); + return UseCaptureKind::PASSTHROUGH; case Instruction::ICmp: { unsigned Idx = U.getOperandNo(); unsigned OtherIdx = 1 - Idx; - if (isa(I->getOperand(OtherIdx)) && - cast(I)->isEquality()) { - // TODO(captures): Remove these special cases once we make use of - // captures(address_is_null). - + if (auto *CPN = dyn_cast(I->getOperand(OtherIdx))) { // Don't count comparisons of a no-alias return value against null as // captures. This allows us to ignore comparisons of malloc results // with null, for example. - if (U->getType()->getPointerAddressSpace() == 0) + if (CPN->getType()->getAddressSpace() == 0) if (isNoAliasCall(U.get()->stripPointerCasts())) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; if (!I->getFunction()->nullPointerIsDefined()) { auto *O = I->getOperand(Idx)->stripPointerCastsSameRepresentation(); // Comparing a dereferenceable_or_null pointer against null cannot @@ -397,23 +390,17 @@ UseCaptureInfo llvm::DetermineUseCaptureKind( // valid (in-bounds) pointer. const DataLayout &DL = I->getDataLayout(); if (IsDereferenceableOrNull && IsDereferenceableOrNull(O, DL)) - return CaptureComponents::None; + return UseCaptureKind::NO_CAPTURE; } - - // Check whether this is a comparison of the base pointer against - // null. - if (U.get() == Base) - return CaptureComponents::AddressIsNull; } // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. However, only the address is captured, not the - // provenance. - return CaptureComponents::Address; + // using comparisons. + return UseCaptureKind::MAY_CAPTURE; } default: // Something else - be conservative and say it is captured. 
- return CaptureComponents::All; + return UseCaptureKind::MAY_CAPTURE; } } @@ -451,26 +438,18 @@ void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker, }; while (!Worklist.empty()) { const Use *U = Worklist.pop_back_val(); - UseCaptureInfo CI = DetermineUseCaptureKind(*U, V, IsDereferenceableOrNull); - if (capturesAnything(CI.UseCC)) { - switch (Tracker->captured(U, CI)) { - case CaptureTracker::Stop: + switch (DetermineUseCaptureKind(*U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: + continue; + case UseCaptureKind::MAY_CAPTURE: + if (Tracker->captured(U)) return; - case CaptureTracker::ContinueIgnoringReturn: - continue; - case CaptureTracker::Continue: - // Fall through to passthrough handling, but only if ResultCC contains - // additional components that UseCC does not. We assume that a - // capture at this point will be strictly more constraining than a - // later capture from following the return value. - if (capturesNothing(CI.ResultCC & ~CI.UseCC)) - continue; - break; - } + continue; + case UseCaptureKind::PASSTHROUGH: + if (!AddUses(U->getUser())) + return; + continue; } - // TODO(captures): We could keep track of ResultCC for the users. - if (capturesAnything(CI.ResultCC) && !AddUses(U->getUser())) - return; } // All uses examined. diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index d25c1eecaf1ca..59002cd934ab1 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -2788,8 +2788,7 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, struct CustomCaptureTracker : public CaptureTracker { bool Captured = false; void tooManyUses() override { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { if (auto *ICmp = dyn_cast(U->getUser())) { // Comparison against value stored in global variable. Given the // pointer does not escape, its value cannot be guessed and stored @@ -2797,11 +2796,11 @@ static Constant *computePointerICmp(CmpPredicate Pred, Value *LHS, Value *RHS, unsigned OtherIdx = 1 - U->getOperandNo(); auto *LI = dyn_cast(ICmp->getOperand(OtherIdx)); if (LI && isa(LI->getPointerOperand())) - return Continue; + return false; } Captured = true; - return Stop; + return true; } }; CustomCaptureTracker Tracker; diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp index 5a22ac8abc3fc..5dc5b025599b1 100644 --- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp +++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp @@ -798,8 +798,13 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, Value *Ptr, PredicatedScalarEvolution &PSE) { // The access function must stride over the innermost loop. if (Lp != AR->getLoop()) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not striding over innermost loop " - << *Ptr << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not striding over innermost loop "; + if (Ptr) + dbgs() << *Ptr << " "; + + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -809,8 +814,12 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, // Calculate the pointer stride and check if it is constant. 
const SCEVConstant *C = dyn_cast(Step); if (!C) { - LLVM_DEBUG(dbgs() << "LAA: Bad stride - Not a constant strided " << *Ptr - << " SCEV: " << *AR << "\n"); + LLVM_DEBUG({ + dbgs() << "LAA: Bad stride - Not a constant strided "; + if (Ptr) + dbgs() << *Ptr << " "; + dbgs() << "SCEV: " << *AR << "\n"; + }); return std::nullopt; } @@ -837,8 +846,8 @@ getStrideFromAddRec(const SCEVAddRecExpr *AR, const Loop *Lp, Type *AccessTy, static bool isNoWrapGEP(Value *Ptr, PredicatedScalarEvolution &PSE, const Loop *L); -/// Check whether \p AR is a non-wrapping AddRec, or if \p Ptr is a non-wrapping -/// GEP. +/// Check whether \p AR is a non-wrapping AddRec. If \p Ptr is not nullptr, use +/// information from the IR pointer value to determine no-wrap. static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, Value *Ptr, Type *AccessTy, const Loop *L, bool Assume, std::optional Stride = std::nullopt) { @@ -846,12 +855,12 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, if (AR->getNoWrapFlags(SCEV::NoWrapMask)) return true; - if (PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) + if (Ptr && PSE.hasNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW)) return true; // The address calculation must not wrap. Otherwise, a dependence could be // inverted. - if (isNoWrapGEP(Ptr, PSE, L)) + if (Ptr && isNoWrapGEP(Ptr, PSE, L)) return true; // An nusw getelementptr that is an AddRec cannot wrap. If it would wrap, @@ -859,7 +868,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, // location will be larger than half the pointer index type space. In that // case, the GEP would be poison and any memory access dependent on it would // be immediate UB when executed. - if (auto *GEP = dyn_cast(Ptr); + if (auto *GEP = dyn_cast_if_present(Ptr); GEP && GEP->hasNoUnsignedSignedWrap()) return true; @@ -875,7 +884,7 @@ static bool isNoWrap(PredicatedScalarEvolution &PSE, const SCEVAddRecExpr *AR, return true; } - if (Assume) { + if (Ptr && Assume) { PSE.setNoOverflow(Ptr, SCEVWrapPredicate::IncrementNUSW); LLVM_DEBUG(dbgs() << "LAA: Pointer may wrap:\n" << "LAA: Pointer: " << *Ptr << "\n" @@ -1117,6 +1126,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, SmallVector> TranslatedPtrs = findForkedPointer(PSE, StridesMap, Ptr, TheLoop); + assert(!TranslatedPtrs.empty() && "must have some translated pointers"); /// Check whether all pointers can participate in a runtime bounds check. They /// must either be invariant or AddRecs. If ShouldCheckWrap is true, they also @@ -1142,13 +1152,10 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck, // When we run after a failing dependency check we have to make sure // we don't have wrapping pointers. - if (ShouldCheckWrap) { - // Skip wrap checking when translating pointers. - if (TranslatedPtrs.size() > 1) - return false; - - if (!isNoWrap(PSE, AR, Ptr, AccessTy, TheLoop, Assume)) - return false; + if (ShouldCheckWrap && + !isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy, + TheLoop, Assume)) { + return false; } } diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 413d9f68e6cc3..f8f5432c73a0f 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1229,13 +1229,13 @@ static Value *getValueFwdRef(BitcodeReaderValueList &ValueList, unsigned Idx, // This is a reference to a no longer supported constant expression.
// Pretend that the constant was deleted, which will replace metadata - // references with undef. + // references with poison. // TODO: This is a rather indirect check. It would be more elegant to use // a separate ErrorInfo for constant materialization failure and thread // the error reporting through getValueFwdRef(). if (Idx < ValueList.size() && ValueList[Idx] && ValueList[Idx]->getType() == Ty) - return UndefValue::get(Ty); + return PoisonValue::get(Ty); return nullptr; } diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp index 1c603f5988ad1..e8d1aba63afb4 100644 --- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp @@ -252,10 +252,10 @@ void EHStreamer::computeCallSiteTable( if (&MBB == &Asm->MF->front() || MBB.isBeginSection()) { // We start a call-site range upon function entry and at the beginning of // every basic block section. - CallSiteRanges.push_back( - {Asm->MBBSectionRanges[MBB.getSectionID()].BeginLabel, - Asm->MBBSectionRanges[MBB.getSectionID()].EndLabel, - Asm->getMBBExceptionSym(MBB), CallSites.size()}); + auto &Range = Asm->MBBSectionRanges[MBB.getSectionID()]; + CallSiteRanges.push_back({Range.BeginLabel, Range.EndLabel, + Asm->getMBBExceptionSym(MBB), + CallSites.size()}); PreviousIsInvoke = false; SawPotentiallyThrowing = false; LastLabel = nullptr; diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index bc7cdf38dbc2a..f52447b86a7e4 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -16145,7 +16145,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { // also recursively replace t184 by t150. SDValue MaybePoisonOperand = N->getOperand(0).getOperand(OpNo); // Don't replace every single UNDEF everywhere with frozen UNDEF, though. - if (MaybePoisonOperand.getOpcode() == ISD::UNDEF) + if (MaybePoisonOperand.isUndef()) continue; // First, freeze each offending operand. SDValue FrozenMaybePoisonOperand = DAG.getFreeze(MaybePoisonOperand); @@ -16173,7 +16173,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) { SmallVector Ops(N0->ops()); // Special-handle ISD::UNDEF, each single one of them can be it's own thing. for (SDValue &Op : Ops) { - if (Op.getOpcode() == ISD::UNDEF) + if (Op.isUndef()) Op = DAG.getFreeze(Op); } @@ -24289,7 +24289,7 @@ static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) { if (ISD::BITCAST == Op.getOpcode() && !Op.getOperand(0).getValueType().isVector()) Ops.push_back(Op.getOperand(0)); - else if (ISD::UNDEF == Op.getOpcode()) + else if (Op.isUndef()) Ops.push_back(DAG.getNode(ISD::UNDEF, DL, SVT)); else return SDValue(); @@ -24684,7 +24684,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...)) // -> (BUILD_VECTOR A, B, ..., C, D, ...) 
auto IsBuildVectorOrUndef = [](const SDValue &Op) { - return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode(); + return Op.isUndef() || ISD::BUILD_VECTOR == Op.getOpcode(); }; if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) { SmallVector Opnds; @@ -24708,7 +24708,7 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { EVT OpVT = Op.getValueType(); unsigned NumElts = OpVT.getVectorNumElements(); - if (ISD::UNDEF == Op.getOpcode()) + if (Op.isUndef()) Opnds.append(NumElts, DAG.getUNDEF(MinVT)); if (ISD::BUILD_VECTOR == Op.getOpcode()) { diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 80c2de1d99542..de092cba333c2 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -6285,7 +6285,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); break; @@ -6305,7 +6305,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, Flags.setNonNeg(N1->getFlags().hasNonNeg()); return getNode(ISD::ZERO_EXTEND, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -6347,7 +6347,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) return getNode(OpOpcode, DL, VT, N1.getOperand(0), Flags); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // (ext (trunc x)) -> x @@ -6382,7 +6382,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, return getNode(ISD::TRUNCATE, DL, VT, N1.getOperand(0)); return N1.getOperand(0); } - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::VSCALE && !NewNodesMustHaveLegalTypes) return getVScale(DL, VT, @@ -6400,14 +6400,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::ABS: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid ABS!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getConstant(0, DL, VT); break; case ISD::BSWAP: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BSWAP!"); assert((VT.getScalarSizeInBits() % 16 == 0) && "BSWAP types must be a multiple of 16 bits!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // bswap(bswap(X)) -> X. if (OpOpcode == ISD::BSWAP) @@ -6415,7 +6415,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::BITREVERSE: assert(VT.isInteger() && VT == N1.getValueType() && "Invalid BITREVERSE!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::BITCAST: @@ -6424,7 +6424,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (VT == N1.getValueType()) return N1; // noop conversion. 
if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x) return getNode(ISD::BITCAST, DL, VT, N1.getOperand(0)); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); break; case ISD::SCALAR_TO_VECTOR: @@ -6434,7 +6434,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, N1.getValueType().isInteger() && VT.getVectorElementType().bitsLE(N1.getValueType()))) && "Illegal SCALAR_TO_VECTOR node!"); - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined. if (OpOpcode == ISD::EXTRACT_VECTOR_ELT && @@ -6445,7 +6445,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, break; case ISD::FNEG: // Negation of an unknown bag of bits is still completely undefined. - if (OpOpcode == ISD::UNDEF) + if (N1.isUndef()) return getUNDEF(VT); if (OpOpcode == ISD::FNEG) // --X -> X @@ -13364,7 +13364,7 @@ void BuildVectorSDNode::recastRawBits(bool IsLittleEndian, bool BuildVectorSDNode::isConstant() const { for (const SDValue &Op : op_values()) { unsigned Opc = Op.getOpcode(); - if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP) + if (!Op.isUndef() && Opc != ISD::Constant && Opc != ISD::ConstantFP) return false; } return true; diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp index b98523cac1f2f..1970716485613 100644 --- a/llvm/lib/CodeGen/WinEHPrepare.cpp +++ b/llvm/lib/CodeGen/WinEHPrepare.cpp @@ -251,15 +251,15 @@ void llvm::calculateCXXStateForAsynchEH(const BasicBlock *BB, int State, const BasicBlock *BB = WI->Block; int State = WI->State; delete WI; - if (auto It = EHInfo.BlockToStateMap.find(BB); - It != EHInfo.BlockToStateMap.end() && It->second <= State) + auto [StateIt, Inserted] = EHInfo.BlockToStateMap.try_emplace(BB); + if (!Inserted && StateIt->second <= State) continue; // skip blocks already visited by lower State BasicBlock::const_iterator It = BB->getFirstNonPHIIt(); const llvm::Instruction *TI = BB->getTerminator(); if (It->isEHPad()) State = EHInfo.EHPadStateMap[&*It]; - EHInfo.BlockToStateMap[BB] = State; // Record state, also flag visiting + StateIt->second = State; // Record state, also flag visiting if ((isa(TI) || isa(TI)) && State > 0) { // Retrive the new State diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b5d1bc81b9d95..e2d607368e94b 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -711,20 +711,6 @@ CaptureInfo CallBase::getCaptureInfo(unsigned OpNo) const { return OBU.isDeoptOperandBundle() ? 
CaptureInfo::none() : CaptureInfo::all(); } -bool CallBase::hasArgumentWithAdditionalReturnCaptureComponents() const { - for (unsigned I = 0, E = arg_size(); I < E; ++I) { - if (!getArgOperand(I)->getType()->isPointerTy()) - continue; - - CaptureInfo CI = getParamAttributes(I).getCaptureInfo(); - if (auto *Fn = dyn_cast(getCalledOperand())) - CI &= Fn->getAttributes().getParamAttrs(I).getCaptureInfo(); - if (capturesAnything(CI.getRetComponents() & ~CI.getOtherComponents())) - return true; - } - return false; -} - //===----------------------------------------------------------------------===// // CallInst Implementation //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index e38fce764b640..1f94400f7c088 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -170,9 +170,6 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { // TODO: BridgeOS should be included in isOSDarwin. setLibcallName(RTLIB::EXP10_F32, "__exp10f"); setLibcallName(RTLIB::EXP10_F64, "__exp10"); - } else { - setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); } if (TT.isGNUEnvironment() || TT.isOSFuchsia() || diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index 2d3d70db50c39..ac25d76709726 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -545,10 +545,6 @@ StringRef ELFObjectFileBase::getAMDGPUCPUName() const { return "gfx90a"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: return "gfx90c"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: - return "gfx940"; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: - return "gfx941"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: return "gfx942"; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: diff --git a/llvm/lib/Object/GOFFObjectFile.cpp b/llvm/lib/Object/GOFFObjectFile.cpp index 7806953aecd29..a55005e689e62 100644 --- a/llvm/lib/Object/GOFFObjectFile.cpp +++ b/llvm/lib/Object/GOFFObjectFile.cpp @@ -503,8 +503,9 @@ GOFFObjectFile::getSectionContents(DataRefImpl Sec) const { std::copy(CompleteData.data(), CompleteData.data() + TxtDataSize, Data.begin() + TxtDataOffset); } - SectionDataCache[Sec.d.a] = Data; - return ArrayRef(SectionDataCache[Sec.d.a]); + auto &Cache = SectionDataCache[Sec.d.a]; + Cache = Data; + return ArrayRef(Cache); } uint64_t GOFFObjectFile::getSectionAlignment(DataRefImpl Sec) const { diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp index 05e4d85b2ea5d..1f970739c1e7e 100644 --- a/llvm/lib/ObjectYAML/ELFYAML.cpp +++ b/llvm/lib/ObjectYAML/ELFYAML.cpp @@ -609,8 +609,6 @@ void ScalarBitSetTraits::bitset(IO &IO, BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX909, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90A, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX90C, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX940, EF_AMDGPU_MACH); - BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX941, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX942, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX950, EF_AMDGPU_MACH); BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH); diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index b977b6aaaf619..30d9372e4afd1 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -1059,7 +1059,6 @@ def ProcessorFeatures { FeatureJS, FeatureLSE, 
FeaturePAuth, FeatureRAS, FeatureRCPC, FeatureCCIDX, FeatureRDM]; - list Oryon = [HasV8_6aOps, FeatureNEON, FeaturePerfMon, FeatureRandGen, FeaturePAuth, FeatureSM4, FeatureSHA2, @@ -1068,6 +1067,7 @@ def ProcessorFeatures { FeatureDotProd, FeatureFPARMv8, FeatureMatMulInt8, FeatureSSBS, FeatureCCIDX, FeatureJS, FeatureLSE, FeatureRAS, FeatureRCPC, FeatureRDM]; + list Grace = !listconcat(NeoverseV2, [FeatureSVE2SM4, FeatureSVEAES, FeatureSVE2SHA3]); // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. The extensions do not @@ -1151,6 +1151,8 @@ def : ProcessorModel<"cortex-x4", NeoverseV2Model, ProcessorFeatures.X4, [TuneX4]>; def : ProcessorModel<"cortex-x925", NeoverseV2Model, ProcessorFeatures.X925, [TuneX925]>; +def : ProcessorModel<"grace", NeoverseV2Model, ProcessorFeatures.Grace, + [TuneNeoverseV2]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, ProcessorFeatures.NeoverseE1, [TuneNeoverseE1]>; def : ProcessorModel<"neoverse-n1", NeoverseN1Model, @@ -1166,7 +1168,6 @@ def : ProcessorModel<"neoverse-v1", NeoverseV1Model, ProcessorFeatures.NeoverseV1, [TuneNeoverseV1]>; def : ProcessorModel<"neoverse-v2", NeoverseV2Model, ProcessorFeatures.NeoverseV2, [TuneNeoverseV2]>; -def : ProcessorAlias<"grace", "neoverse-v2">; def : ProcessorModel<"neoverse-v3", NeoverseV2Model, ProcessorFeatures.NeoverseV3, [TuneNeoverseV3]>; def : ProcessorModel<"neoverse-v3ae", NeoverseV2Model, diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 6439149d801f6..effc8d2ed6b49 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1040,12 +1040,6 @@ def FeatureVALUTransUseHazard : SubtargetFeature<"valu-trans-use-hazard", "Hazard when TRANS instructions are closely followed by a use of the result" >; -def FeatureForceStoreSC0SC1 : SubtargetFeature<"force-store-sc0-sc1", - "HasForceStoreSC0SC1", - "true", - "Has SC0 and SC1 on stores" ->; - def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "HasSALUFloatInsts", "true", @@ -1619,28 +1613,6 @@ def FeatureISAVersion9_5_Common : FeatureSet< FeatureAtomicBufferPkAddBF16Inst ])>; -def FeatureISAVersion9_4_0 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - -def FeatureISAVersion9_4_1 : FeatureSet< - !listconcat(FeatureISAVersion9_4_Common.Features, - [ - FeatureAddressableLocalMemorySize65536, - FeatureForceStoreSC0SC1, - FeatureFP8Insts, - FeatureFP8ConversionInsts, - FeatureCvtFP8VOP1Bug, - FeatureXF32Insts - ])>; - def FeatureISAVersion9_4_2 : FeatureSet< !listconcat(FeatureISAVersion9_4_Common.Features, [ diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 3bbbbcf71d8ae..cf3843869808b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -4295,7 +4295,7 @@ AMDGPUInstructionSelector::selectVOP3PModsImpl( // TODO: Handle G_FSUB 0 as fneg // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector. - (void)IsDOT; // DOTs do not use OPSEL on gfx940+, check ST.hasDOTOpSelHazard() + (void)IsDOT; // DOTs do not use OPSEL on gfx942+, check ST.hasDOTOpSelHazard() // Packed instructions do not have abs modifiers. 
Mods |= SISrcMods::OP_SEL_1; diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index 9ca853befba73..d3487daee364f 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1773,7 +1773,7 @@ def DS_READ_B128_vi : DS_Real_vi<0xff, DS_READ_B128>; def DS_ADD_F64_vi : DS_Real_vi<0x5c, DS_ADD_F64>; def DS_ADD_RTN_F64_vi : DS_Real_vi<0x7c, DS_ADD_RTN_F64>; -// GFX940+. +// GFX942+. def DS_PK_ADD_F16_vi : DS_Real_vi<0x17, DS_PK_ADD_F16>; def DS_PK_ADD_RTN_F16_vi : DS_Real_vi<0xb7, DS_PK_ADD_RTN_F16>; def DS_PK_ADD_BF16_vi : DS_Real_vi<0x18, DS_PK_ADD_BF16>; diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index ea6e703eba5d9..7988a9ac0ce55 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -814,7 +814,7 @@ defm FLAT_ATOMIC_FMAX : FLAT_Atomic_Pseudo <"flat_atomic_fmax", } // End SubtargetPredicate = isGFX7GFX10GFX11 -// GFX940-, GFX11-only flat instructions. +// GFX942-, GFX11-only flat instructions. let SubtargetPredicate = HasFlatAtomicFaddF32Inst in { defm FLAT_ATOMIC_ADD_F32 : FLAT_Atomic_Pseudo<"flat_atomic_add_f32", VGPR_32, f32>; } // End SubtargetPredicate = HasFlatAtomicFaddF32Inst @@ -2076,7 +2076,7 @@ defm SCRATCH_STORE_DWORDX3 : FLAT_Real_AllAddr_SVE_vi <0x1e>; defm SCRATCH_STORE_DWORDX4 : FLAT_Real_AllAddr_SVE_vi <0x1f>; let SubtargetPredicate = isGFX8GFX9NotGFX940 in { - // These instructions are encoded differently on gfx90* and gfx940. + // These instructions are encoded differently on gfx90* and gfx94*. defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Real_Atomics_vi <0x04d, 0>; defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Real_Atomics_vi <0x04e, 0>; } diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 827598078af53..1ff75095b220a 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -2292,7 +2292,7 @@ GFX940_SMFMA_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWritesVGPROverlappedSrcABWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2600,7 +2600,7 @@ static int GFX940_SMFMA_N_PassWriteVgprVALUWawWaitStates(int NumPasses) { static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 @@ -2610,7 +2610,7 @@ static int GFX940_XDL_N_PassWriteVgprVALUWawWaitStates(int NumPasses, static int GFX940_XDL_N_PassWriteVgprVALUMemExpReadWaitStates(int NumPasses, bool IsGFX950) { - // xdl def cycles | gfx940 | gfx950 + // xdl def cycles | gfx942 | gfx950 // 2 pass | 5 5 // 4 pass | 7 8 // 8 pass | 11 12 diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td index a86c76bb6075e..0b372e29efe67 100644 --- a/llvm/lib/Target/AMDGPU/GCNProcessors.td +++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td @@ -192,15 +192,7 @@ def : ProcessorModel<"gfx90c", SIQuarterSpeedModel, FeatureISAVersion9_0_C.Features >; -def : ProcessorModel<"gfx940", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_0.Features ->; - -def : ProcessorModel<"gfx941", SIDPGFX940FullSpeedModel, - FeatureISAVersion9_4_1.Features ->; - -def : ProcessorModel<"gfx942", 
SIDPGFX940FullSpeedModel, +def : ProcessorModel<"gfx942", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_2.Features >; @@ -213,8 +205,8 @@ def : ProcessorModel<"gfx9-generic", SIQuarterSpeedModel, FeatureISAVersion9_Generic.Features >; -// [gfx940, gfx941, gfx942] -def : ProcessorModel<"gfx9-4-generic", SIDPGFX940FullSpeedModel, +// [gfx942] +def : ProcessorModel<"gfx9-4-generic", SIDPGFX942FullSpeedModel, FeatureISAVersion9_4_Generic.Features >; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp index b5e8e246825c7..55af5826e90d0 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp @@ -422,10 +422,10 @@ unsigned GCNSubtarget::getBaseMaxNumSGPRs( // Check if maximum number of SGPRs was explicitly requested using // "amdgpu-num-sgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-sgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs); + if (Requested != MaxNumSGPRs) { // Make sure requested value does not violate subtarget's specifications. if (Requested && (Requested <= ReservedNumSGPRs)) Requested = 0; @@ -504,10 +504,9 @@ unsigned GCNSubtarget::getBaseMaxNumVGPRs( // Check if maximum number of VGPRs was explicitly requested using // "amdgpu-num-vgpr" attribute. - if (F.hasFnAttribute("amdgpu-num-vgpr")) { - unsigned Requested = - F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); - + unsigned Requested = + F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", MaxNumVGPRs); + if (Requested != MaxNumVGPRs) { if (hasGFX90AInsts()) Requested *= 2; diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 342b211199dca..6664a70572ded 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -246,7 +246,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasMADIntraFwdBug = false; bool HasVOPDInsts = false; bool HasVALUTransUseHazard = false; - bool HasForceStoreSC0SC1 = false; bool HasRequiredExportPriority = false; bool HasVmemWriteVgprInOrder = false; bool HasAshrPkInsts = false; @@ -1264,8 +1263,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasCvtScaleForwardingHazard() const { return GFX950Insts; } - bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; } - bool requiresCodeObjectV6() const { return RequiresCOV6; } bool hasVALUMaskWriteHazard() const { return getGeneration() == GFX11; } @@ -1297,11 +1294,11 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasPackedTID() const { return HasPackedTID; } - // GFX940 is a derivation to GFX90A. hasGFX940Insts() being true implies that + // GFX94* is a derivation to GFX90A. hasGFX940Insts() being true implies that // hasGFX90AInsts is also true. bool hasGFX940Insts() const { return GFX940Insts; } - // GFX950 is a derivation to GFX940. hasGFX950Insts() implies that + // GFX950 is a derivation to GFX94*. hasGFX950Insts() implies that // hasGFX940Insts and hasGFX90AInsts are also true. 
bool hasGFX950Insts() const { return GFX950Insts; } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index 059bab5838526..4a4ad712e304d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -93,8 +93,6 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) { case ELF::EF_AMDGPU_MACH_AMDGCN_GFX909: AK = GK_GFX909; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A: AK = GK_GFX90A; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C: AK = GK_GFX90C; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX940: AK = GK_GFX940; break; - case ELF::EF_AMDGPU_MACH_AMDGCN_GFX941: AK = GK_GFX941; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX942: AK = GK_GFX942; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX950: AK = GK_GFX950; break; case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break; @@ -180,8 +178,6 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) { case GK_GFX909: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX909; case GK_GFX90A: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90A; case GK_GFX90C: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX90C; - case GK_GFX940: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX940; - case GK_GFX941: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX941; case GK_GFX942: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX942; case GK_GFX950: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX950; case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010; diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index f812ae652b63d..721601efcc804 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -542,7 +542,7 @@ enum Id { // HwRegCode, (6) [5:0] ID_EXCP_FLAG_USER = 18, ID_TRAP_CTRL = 19, - // GFX940 specific registers + // GFX94* specific registers ID_XCC_ID = 20, ID_SQ_PERF_SNAPSHOT_DATA = 21, ID_SQ_PERF_SNAPSHOT_DATA1 = 22, diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index ab396929162d0..fa15e73bc31d5 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -203,6 +203,8 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::V_FMA_F32_e64; case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; + case AMDGPU::V_FMAC_F16_t16_e64: + return AMDGPU::V_FMA_F16_gfx9_t16_e64; case AMDGPU::V_FMAC_F16_fake16_e64: return AMDGPU::V_FMA_F16_gfx9_fake16_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index e09b310d107ac..909ad07782fc6 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -16823,39 +16823,39 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // safe. The message phrasing also should be better. 
if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) { if (AS == AMDGPUAS::FLAT_ADDRESS) { - // gfx940, gfx12 + // gfx942, gfx12 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) { - // gfx90a, gfx940, gfx12 + // gfx90a, gfx942, gfx12 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // gfx940, gfx12 + // gfx942, gfx12 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) { - // gfx90a, gfx940, gfx12 + // gfx90a, gfx942, gfx12 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // While gfx90a/gfx940 supports v2bf16 for global/flat, it does not for + // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for // buffer. gfx12 does have the buffer version. if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty)) return ReportUnsafeHWInst(AtomicExpansionKind::None); } - // global and flat atomic fadd f64: gfx90a, gfx940. + // global and flat atomic fadd f64: gfx90a, gfx942. if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy()) return ReportUnsafeHWInst(AtomicExpansionKind::None); if (AS != AMDGPUAS::FLAT_ADDRESS) { if (Ty->isFloatTy()) { - // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx940, + // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942, // gfx11+. if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); - // global/buffer atomic fadd f32 rtn: gfx90a, gfx940, gfx11+. + // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+. if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts()) return ReportUnsafeHWInst(AtomicExpansionKind::None); } else { @@ -16867,7 +16867,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { } } - // flat atomic fadd f32: gfx940, gfx11+. + // flat atomic fadd f32: gfx942, gfx11+. if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) { if (Subtarget->hasFlatAtomicFaddF32Inst()) return ReportUnsafeHWInst(AtomicExpansionKind::None); @@ -16906,7 +16906,7 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const { // float, double restored in gfx10. // double removed again in gfx11, so only f32 for gfx11/gfx12. // - // For gfx9, gfx90a and gfx940 support f64 for global (same as fadd), but + // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but // no f32. 
if (AS == AMDGPUAS::FLAT_ADDRESS) { if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy()) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 7dace11d208a0..2cf6de73fa90c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3461,6 +3461,62 @@ std::optional SIInstrInfo::extractSubregFromImm(int64_t Imm, llvm_unreachable("covered subregister switch"); } +static unsigned getNewFMAAKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADAK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADAK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAAK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMAAK_F16_t16 + : AMDGPU::V_FMAAK_F16_fake16 + : AMDGPU::V_FMAAK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + +static unsigned getNewFMAMKInst(const GCNSubtarget &ST, unsigned Opc) { + switch (Opc) { + case AMDGPU::V_MAC_F16_e32: + case AMDGPU::V_MAC_F16_e64: + case AMDGPU::V_MAD_F16_e64: + return AMDGPU::V_MADMK_F16; + case AMDGPU::V_MAC_F32_e32: + case AMDGPU::V_MAC_F32_e64: + case AMDGPU::V_MAD_F32_e64: + return AMDGPU::V_MADMK_F32; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_FMAC_F32_e64: + case AMDGPU::V_FMA_F32_e64: + return AMDGPU::V_FMAMK_F32; + case AMDGPU::V_FMAC_F16_e32: + case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: + case AMDGPU::V_FMA_F16_e64: + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? AMDGPU::V_FMAMK_F16_t16 + : AMDGPU::V_FMAMK_F16_fake16 + : AMDGPU::V_FMAMK_F16; + default: + llvm_unreachable("invalid instruction"); + } +} + bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg, MachineRegisterInfo *MRI) const { if (!MRI->hasOneNonDBGUse(Reg)) @@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || + Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. @@ -3550,12 +3607,6 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, MachineOperand *Src0 = &UseMI.getOperand(Src0Idx); - bool IsF32 = Opc == AMDGPU::V_MAD_F32_e64 || Opc == AMDGPU::V_MAC_F32_e64 || - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64; - bool IsFMA = - Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -3586,18 +3637,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, !isInlineConstant(Def->getOperand(1))) return false; - unsigned NewOpc = - IsFMA ? (IsF32 ? 
AMDGPU::V_FMAMK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) + // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. + if (NewOpc == AMDGPU::V_FMAMK_F16_t16 || + NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; const std::optional SubRegImm = extractSubregFromImm( @@ -3613,7 +3661,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3666,25 +3714,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, } } - unsigned NewOpc = - IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite - // would also require restricting their register classes. For now - // just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_fake16) + // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16 + // takes VGPR_32_Lo128 operands, so the rewrite would also require + // restricting their register classes. For now just bail out. + if (NewOpc == AMDGPU::V_FMAAK_F16_t16 || + NewOpc == AMDGPU::V_FMAAK_F16_fake16) return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3874,8 +3919,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) { return AMDGPU::V_FMA_LEGACY_F32_e64; case AMDGPU::V_FMAC_F16_e32: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: - return ST.hasTrue16BitInsts() ? AMDGPU::V_FMA_F16_gfx9_fake16_e64 + return ST.hasTrue16BitInsts() ? ST.useRealTrue16Insts() + ? 
AMDGPU::V_FMA_F16_gfx9_t16_e64 + : AMDGPU::V_FMA_F16_gfx9_fake16_e64 : AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_F32_e32: case AMDGPU::V_FMAC_F32_e64: @@ -3941,21 +3989,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } - assert( - Opc != AMDGPU::V_FMAC_F16_fake16_e32 && - "V_FMAC_F16_fake16_e32 is not supported and not expected to be present " - "pre-RA"); + assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && + Opc != AMDGPU::V_FMAC_F16_fake16_e32 && + "V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be " + "present pre-RA"); // Handle MAC/FMAC. - bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64; - bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || - Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || - Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_fake16_e64 || - Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || Opc == AMDGPU::V_MAC_LEGACY_F32_e64 || @@ -3968,6 +4007,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: @@ -4052,11 +4092,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16) - : AMDGPU::V_FMAAK_F32) - : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); + unsigned NewOpc = getNewFMAAKInst(ST, Opc); if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) .add(*Dst) @@ -4071,11 +4107,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } } - unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16) - : AMDGPU::V_FMAMK_F32) - : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); + unsigned NewOpc = getNewFMAMKInst(ST, Opc); if (!Src0Literal && getFoldableImm(Src1, Imm, &DefMI)) { if (pseudoToMCOpcode(NewOpc) != -1) { MIB = BuildMI(MBB, MI, MI.getDebugLoc(), get(NewOpc)) @@ -4513,6 +4545,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F16_e64: + case AMDGPU::V_FMAC_F16_t16_e64: case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F64_e64: @@ -5569,7 +5602,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; - case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64; + case AMDGPU::S_FMAC_F16: + return ST.useRealTrue16Insts() ? 
AMDGPU::V_FMAC_F16_t16_e64 + : AMDGPU::V_FMAC_F16_fake16_e64; case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 6e08aff24ec23..3faf0795157dc 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3287,6 +3287,14 @@ def : GCNPat < (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; +let True16Predicate = UseRealTrue16Insts in +def : GCNPat < + (fma (f16 (VOP3NoMods f16:$src0)), + (f16 (VOP3NoMods f16:$src1)), + (f16 (VOP3NoMods f16:$src2))), + (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + SRCMODS.NONE, $src2) +>; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (fma (f16 (VOP3NoMods f16:$src0)), diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp index be6cff873532b..34953f9c08db7 100644 --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -359,11 +359,6 @@ class SICacheControl { /// Virtual destructor to allow derivations to be deleted. virtual ~SICacheControl() = default; - - virtual bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const { - return false; - } }; class SIGfx6CacheControl : public SICacheControl { @@ -492,7 +487,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { } public: - SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {}; bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI, @@ -518,20 +512,6 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl { bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering, Position Pos) const override; - - bool tryForceStoreSC0SC1(const SIMemOpInfo &MOI, - MachineBasicBlock::iterator &MI) const override { - bool Changed = false; - if (ST.hasForceStoreSC0SC1() && - (MOI.getInstrAddrSpace() & (SIAtomicAddrSpace::SCRATCH | - SIAtomicAddrSpace::GLOBAL | - SIAtomicAddrSpace::OTHER)) != - SIAtomicAddrSpace::NONE) { - Changed |= enableSC0Bit(MI); - Changed |= enableSC1Bit(MI); - } - return Changed; - } }; class SIGfx10CacheControl : public SIGfx7CacheControl { @@ -2821,7 +2801,6 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) { Changed |= expandLoad(*MOI, MI); else if (const auto &MOI = MOA.getStoreInfo(MI)) { Changed |= expandStore(*MOI, MI); - Changed |= CC->tryForceStoreSC0SC1(*MOI, MI); } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) Changed |= expandAtomicFence(*MOI, MI); else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td index 117add324db56..2a374b360b04a 100644 --- a/llvm/lib/Target/AMDGPU/SISchedule.td +++ b/llvm/lib/Target/AMDGPU/SISchedule.td @@ -94,7 +94,7 @@ class SISchedMachineModel : SchedMachineModel { def SIFullSpeedModel : SISchedMachineModel; def SIQuarterSpeedModel : SISchedMachineModel; def SIDPFullSpeedModel : SISchedMachineModel; -def SIDPGFX940FullSpeedModel : SISchedMachineModel; +def SIDPGFX942FullSpeedModel : SISchedMachineModel; def SIDPGFX950FullSpeedModel : SISchedMachineModel; def GFX10SpeedModel : SISchedMachineModel; def GFX11SpeedModel : SISchedMachineModel; @@ -276,7 +276,7 @@ def : 
InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; } // End SchedModel = SIDPFullSpeedModel -let SchedModel = SIDPGFX940FullSpeedModel in { +let SchedModel = SIDPGFX942FullSpeedModel in { defm : SICommonWriteRes; @@ -308,7 +308,7 @@ def : InstRW<[Write8PassDGEMM, MIMFMARead], (instregex "^V_MFMA_.64_16X16X")>; def : InstRW<[Write4PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_16X16X")>; def : InstRW<[Write8PassMAI, MIMFMARead], (instregex "^V_SMFMAC_.32_32X32X")>; -} // End SchedModel = SIDPGFX940FullSpeedModel +} // End SchedModel = SIDPGFX942FullSpeedModel let SchedModel = SIDPGFX950FullSpeedModel in { diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 979812e07fc3f..f03cde455f295 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -455,9 +455,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAAK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAAK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 - : AMDGPU::V_FMAAK_F16; + NewOpcode = AMDGPU::V_FMAAK_F16_fake16; break; } } @@ -485,9 +489,13 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: + NewOpcode = AMDGPU::V_FMAMK_F16; + break; + case AMDGPU::V_FMA_F16_gfx9_t16_e64: + NewOpcode = AMDGPU::V_FMAMK_F16_t16; + break; case AMDGPU::V_FMA_F16_gfx9_fake16_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 - : AMDGPU::V_FMAMK_F16; + NewOpcode = AMDGPU::V_FMAMK_F16_fake16; break; } } @@ -959,6 +967,7 @@ bool SIShrinkInstructions::run(MachineFunction &MF) { MI.getOpcode() == AMDGPU::V_MAD_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_e64 || + MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_t16_e64 || MI.getOpcode() == AMDGPU::V_FMA_F16_gfx9_fake16_e64) { shrinkMadFma(MI); continue; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp index a8e4ce133ffbc..e433b85489e6e 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp @@ -216,7 +216,7 @@ static constexpr CustomOperand Operands[] = { {{"HW_REG_SCRATCH_BASE_HI"}, ID_FLAT_SCR_HI, isGFX12Plus}, {{"HW_REG_SHADER_CYCLES_LO"}, ID_SHADER_CYCLES, isGFX12Plus}, - // GFX940 specific registers + // GFX942 specific registers {{"HW_REG_XCC_ID"}, ID_XCC_ID, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA"}, ID_SQ_PERF_SNAPSHOT_DATA, isGFX940}, {{"HW_REG_SQ_PERF_SNAPSHOT_DATA1"}, ID_SQ_PERF_SNAPSHOT_DATA1, isGFX940}, diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index eb1491feb611e..c7ed73d0e95f7 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -767,6 +767,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, setLibcallName(LC.Op, LC.Name); setLibcallCallingConv(LC.Op, LC.CC); } + } else if (!Subtarget->isTargetMachO()) { + setLibcallName(RTLIB::FPROUND_F32_F16, "__gnu_f2h_ieee"); + setLibcallName(RTLIB::FPEXT_F16_F32, "__gnu_h2f_ieee"); } if (Subtarget->isThumb1Only()) diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp 
b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index d66e3e306d2ff..1710488e4e292 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1886,11 +1886,6 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setLibcallName(RTLIB::SQRT_F32, "__hexagon_fast2_sqrtf"); else setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf"); - - // Routines to handle fp16 storage type. - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - setLibcallName(RTLIB::FPROUND_F64_F16, "__truncdfhf2"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); } const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f20502521829e..ed7963f35a7c7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -7704,6 +7704,48 @@ defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; +multiclass TCGEN05_SHIFT_INTR { + def NAME : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr), + !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"), + [(Intr Int32Regs:$tmem_addr)]>, + Requires<[hasTcgen05Instructions]>; +} +defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; +defm TCGEN05_SHIFT_CG2: TCGEN05_SHIFT_INTR<"2", int_nvvm_tcgen05_shift_down_cg2>; + +multiclass TCGEN05_CP_INTR { + defvar dst_fmt = !if(!eq(src_fmt, ""), "", ".b8x16"); + defvar fmt_asm = StrJoin<".", [dst_fmt, src_fmt]>.ret; + defvar fmt_intr = StrJoin<"_", [src_fmt]>.ret; + + defvar shape_mc_asm = StrJoin<".", [shape, mc]>.ret; + defvar shape_mc_intr = !subst("::", "_", !subst(".", "_", shape_mc_asm)); + + defvar intr_prefix = StrJoin<"_", ["int_nvvm_tcgen05_cp", shape_mc_intr, fmt_intr]>.ret; + defvar IntrCG1 = !cast(intr_prefix # "_cg1"); + defvar IntrCG2 = !cast(intr_prefix # "_cg2"); + + def NAME # _cg1 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; + def NAME # _cg2 : NVPTXInst<(outs), + (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::2." 
# shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", + [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + Requires<[hasTcgen05Instructions]>; +} + +foreach src_fmt = ["", "b6x16_p32", "b4x16_p64"] in { + defm TCGEN05_CP_128x256b # src_fmt : TCGEN05_CP_INTR<"128x256b", src_fmt>; + defm TCGEN05_CP_4x256b # src_fmt : TCGEN05_CP_INTR<"4x256b", src_fmt>; + defm TCGEN05_CP_128x128b # src_fmt : TCGEN05_CP_INTR<"128x128b", src_fmt>; + defm TCGEN05_CP_64x128_1 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::02_13">; + defm TCGEN05_CP_64x128_2 # src_fmt : TCGEN05_CP_INTR<"64x128b", src_fmt, "warpx2::01_23">; + defm TCGEN05_CP_32x128 # src_fmt : TCGEN05_CP_INTR<"32x128b", src_fmt, "warpx4">; +} } // isConvergent let hasSideEffects = 1 in { diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 4720928f472b3..d6c8e8d506799 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -197,11 +197,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, } setOperationAction(ISD::UADDO, RegVT, Custom); - setOperationAction(ISD::USUBO, RegVT, Custom); - - // PowerPC uses addo_carry,subo_carry to propagate carry. - setOperationAction(ISD::UADDO_CARRY, RegVT, Custom); - setOperationAction(ISD::USUBO_CARRY, RegVT, Custom); // On P10, the default lowering generates better code using the // setbc instruction. @@ -265,6 +260,15 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setIndexedStoreAction(ISD::PRE_INC, MVT::f64, Legal); } + // PowerPC uses ADDC/ADDE/SUBC/SUBE to propagate carry. + const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 }; + for (MVT VT : ScalarIntVTs) { + setOperationAction(ISD::ADDC, VT, Legal); + setOperationAction(ISD::ADDE, VT, Legal); + setOperationAction(ISD::SUBC, VT, Legal); + setOperationAction(ISD::SUBE, VT, Legal); + } + if (Subtarget.useCRBits()) { setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); @@ -1850,14 +1854,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { return "PPCISD::SETBC"; case PPCISD::SETBCR: return "PPCISD::SETBCR"; - case PPCISD::ADDC: - return "PPCISD::ADDC"; - case PPCISD::ADDE: - return "PPCISD::ADDE"; - case PPCISD::SUBC: - return "PPCISD::SUBC"; - case PPCISD::SUBE: - return "PPCISD::SUBE"; } return nullptr; } @@ -12017,74 +12013,43 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const { llvm_unreachable("ERROR:Should return for all cases within swtich."); } -static SDValue ConvertCarryValueToCarryFlag(EVT SumType, SDValue Value, - SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Value); - if (STI.useCRBits()) - Value = DAG.getNode(ISD::SELECT, DL, SumType, Value, - DAG.getConstant(1, DL, SumType), - DAG.getConstant(0, DL, SumType)); - else - Value = DAG.getZExtOrTrunc(Value, DL, SumType); - SDValue Sum = DAG.getNode(PPCISD::ADDC, DL, DAG.getVTList(SumType, MVT::i32), - Value, DAG.getAllOnesConstant(DL, SumType)); - return Sum.getValue(1); -} +SDValue PPCTargetLowering::LowerUaddo(SDValue Op, SelectionDAG &DAG) const { + // Default to target independent lowering if there is a logical user of the + // carry-bit. 
+ for (SDNode *U : Op->users()) { + if (U->getOpcode() == ISD::SELECT) + return SDValue(); + if (ISD::isBitwiseLogicOp(U->getOpcode())) { + for (unsigned i = 0, ie = U->getNumOperands(); i != ie; ++i) { + if (U->getOperand(i).getOpcode() != ISD::UADDO && + U->getOperand(i).getOpcode() != ISD::MERGE_VALUES) + return SDValue(); + } + } + } + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDLoc dl(Op); -static SDValue ConvertCarryFlagToCarryValue(EVT SumType, SDValue Flag, - EVT CarryType, SelectionDAG &DAG, - const PPCSubtarget &STI) { - SDLoc DL(Flag); - SDValue Zero = DAG.getConstant(0, DL, SumType); - SDValue Carry = DAG.getNode( - PPCISD::ADDE, DL, DAG.getVTList(SumType, MVT::i32), Zero, Zero, Flag); - if (STI.useCRBits()) - return DAG.getSetCC(DL, CarryType, Carry, Zero, ISD::SETNE); - return DAG.getZExtOrTrunc(Carry, DL, CarryType); -} + // Default to target independent lowering for special cases handled there. + if (isOneConstant(RHS) || isAllOnesConstant(RHS)) + return SDValue(); -SDValue PPCTargetLowering::LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getNode()->getValueType(0); - SDLoc DL(Op); - SDNode *N = Op.getNode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - unsigned Opc = N->getOpcode(); - bool IsAdd = Opc == ISD::UADDO; - Opc = IsAdd ? PPCISD::ADDC : PPCISD::SUBC; - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - N->getOperand(0), N->getOperand(1)); - SDValue Carry = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, - DAG, Subtarget); - if (!IsAdd) - Carry = DAG.getNode(ISD::XOR, DL, CarryType, Carry, - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, Carry); -} - -SDValue PPCTargetLowering::LowerADDSUBO_CARRY(SDValue Op, - SelectionDAG &DAG) const { - SDLoc DL(Op); - SDNode *N = Op.getNode(); - unsigned Opc = N->getOpcode(); - EVT VT = N->getValueType(0); - EVT CarryType = N->getValueType(1); - SDValue CarryOp = N->getOperand(2); - bool IsAdd = Opc == ISD::UADDO_CARRY; - Opc = IsAdd ? 
PPCISD::ADDE : PPCISD::SUBE; - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - CarryOp = ConvertCarryValueToCarryFlag(VT, CarryOp, DAG, Subtarget); - SDValue Sum = DAG.getNode(Opc, DL, DAG.getVTList(VT, MVT::i32), - Op.getOperand(0), Op.getOperand(1), CarryOp); - CarryOp = ConvertCarryFlagToCarryValue(VT, Sum.getValue(1), CarryType, DAG, - Subtarget); - if (!IsAdd) - CarryOp = DAG.getNode(ISD::XOR, DL, CarryOp.getValueType(), CarryOp, - DAG.getAllOnesConstant(DL, CarryOp.getValueType())); - return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, CarryOp); + SDValue ADDC; + SDValue Overflow; + SDVTList VTs = Op.getNode()->getVTList(); + + ADDC = DAG.getNode(ISD::ADDC, dl, DAG.getVTList(VT, MVT::Glue), LHS, RHS); + Overflow = DAG.getNode(ISD::ADDE, dl, DAG.getVTList(VT, MVT::Glue), + DAG.getConstant(0, dl, VT), DAG.getConstant(0, dl, VT), + ADDC.getValue(1)); + SDValue OverflowTrunc = + DAG.getNode(ISD::TRUNCATE, dl, Op.getNode()->getValueType(1), Overflow); + SDValue Res = + DAG.getNode(ISD::MERGE_VALUES, dl, VTs, ADDC.getValue(0), OverflowTrunc); + return Res; } SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { @@ -12115,8 +12080,8 @@ SDValue PPCTargetLowering::LowerSSUBO(SDValue Op, SelectionDAG &DAG) const { /// SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { - default: - llvm_unreachable("Wasn't expecting to be able to lower this!"); + default: llvm_unreachable("Wasn't expecting to be able to lower this!"); + case ISD::UADDO: return LowerUaddo(Op, DAG); case ISD::FPOW: return lowerPow(Op, DAG); case ISD::FSIN: return lowerSin(Op, DAG); case ISD::FCOS: return lowerCos(Op, DAG); @@ -12209,12 +12174,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { return LowerATOMIC_LOAD_STORE(Op, DAG); case ISD::IS_FPCLASS: return LowerIS_FPCLASS(Op, DAG); - case ISD::UADDO: - case ISD::USUBO: - return LowerADDSUBO(Op, DAG); - case ISD::UADDO_CARRY: - case ISD::USUBO_CARRY: - return LowerADDSUBO_CARRY(Op, DAG); } } @@ -16150,21 +16109,6 @@ static bool isStoreConditional(SDValue Intrin, unsigned &StoreWidth) { return true; } -static SDValue DAGCombineAddc(SDNode *N, - llvm::PPCTargetLowering::DAGCombinerInfo &DCI) { - if (N->getOpcode() == PPCISD::ADDC && N->hasAnyUseOfValue(1)) { - // (ADDC (ADDE 0, 0, C), -1) -> C - SDValue LHS = N->getOperand(0); - SDValue RHS = N->getOperand(1); - if (LHS->getOpcode() == PPCISD::ADDE && - isNullConstant(LHS->getOperand(0)) && - isNullConstant(LHS->getOperand(1)) && isAllOnesConstant(RHS)) { - return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2)); - } - } - return SDValue(); -} - SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -16953,8 +16897,6 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, } case ISD::BUILD_VECTOR: return DAGCombineBuildVector(N, DCI); - case PPCISD::ADDC: - return DAGCombineAddc(N, DCI); } return SDValue(); @@ -17008,16 +16950,6 @@ void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero = 0xFFFF0000; break; } - case PPCISD::ADDE: { - if (Op.getResNo() == 0) { - // (0|1), _ = ADDE 0, 0, CARRY - SDValue LHS = Op.getOperand(0); - SDValue RHS = Op.getOperand(1); - if (isNullConstant(LHS) && isNullConstant(RHS)) - Known.Zero = ~1ULL; - } - break; - } case ISD::INTRINSIC_WO_CHAIN: { switch (Op.getConstantOperandVal(0)) { 
default: break; @@ -18287,8 +18219,7 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, return SDValue(); SDLoc DL(N); - EVT CarryType = Subtarget.useCRBits() ? MVT::i1 : MVT::i32; - SDVTList VTs = DAG.getVTList(MVT::i64, CarryType); + SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Glue); SDValue Cmp = RHS.getOperand(0); SDValue Z = Cmp.getOperand(0); auto *Constant = cast(Cmp.getOperand(1)); @@ -18306,14 +18237,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Addc = - DAG.getNode(ISD::UADDO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64), - DAG.getConstant(0, DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), + SDValue Addc = DAG.getNode(ISD::ADDC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + AddOrZ, DAG.getAllOnesConstant(DL, MVT::i64)); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), SDValue(Addc.getNode(), 1)); - } + } case ISD::SETEQ: { // when C == 0 // --> addze X, (subfic Z, 0).carry @@ -18324,15 +18252,11 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG, SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Z, DAG.getConstant(NegConstant, DL, MVT::i64)); SDValue AddOrZ = NegConstant != 0 ? Add : Z; - SDValue Subc = - DAG.getNode(ISD::USUBO_CARRY, DL, DAG.getVTList(MVT::i64, CarryType), - DAG.getConstant(0, DL, MVT::i64), AddOrZ, - DAG.getConstant(0, DL, CarryType)); - SDValue Invert = DAG.getNode(ISD::XOR, DL, CarryType, Subc.getValue(1), - DAG.getAllOnesConstant(DL, CarryType)); - return DAG.getNode(ISD::UADDO_CARRY, DL, VTs, LHS, - DAG.getConstant(0, DL, MVT::i64), Invert); - } + SDValue Subc = DAG.getNode(ISD::SUBC, DL, DAG.getVTList(MVT::i64, MVT::Glue), + DAG.getConstant(0, DL, MVT::i64), AddOrZ); + return DAG.getNode(ISD::ADDE, DL, VTs, LHS, DAG.getConstant(0, DL, MVT::i64), + SDValue(Subc.getNode(), 1)); + } } return SDValue(); diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 2d86a224b54c1..514329bbe92d7 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -161,12 +161,6 @@ namespace llvm { SRA, SHL, - /// These nodes represent PPC arithmetic operations with carry. - ADDC, - ADDE, - SUBC, - SUBE, - /// FNMSUB - Negated multiply-subtract instruction. 
FNMSUB, @@ -1286,6 +1280,7 @@ namespace llvm { SDValue LowerGlobalTLSAddressLinux(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUaddo(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const; @@ -1321,8 +1316,6 @@ namespace llvm { SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerIS_FPCLASS(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerADDSUBO(SDValue Op, SelectionDAG &DAG) const; SDValue lowerToLibCall(const char *LibCallName, SDValue Op, SelectionDAG &DAG) const; SDValue lowerLibCallBasedOnType(const char *LibCallFloatName, diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td index 4205b3086a3c9..bcac0de55d9d3 100644 --- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td @@ -760,13 +760,13 @@ def STFDXTLS : XForm_8<31, 727, (outs), (ins f8rc:$RST, ptr_rc_nor0:$RA, tlsreg: let isCommutable = 1 in defm ADDC8 : XOForm_1rc<31, 10, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCaddc i64:$RA, i64:$RB))]>, + [(set i64:$RT, (addc i64:$RA, i64:$RB))]>, PPC970_DGroup_Cracked; let Defs = [CARRY] in def ADDIC8 : DForm_2<12, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCaddc i64:$RA, imm64SExt16:$D))]>; + [(set i64:$RST, (addc i64:$RA, imm64SExt16:$D))]>; def ADDI8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), "addi $RST, $RA, $D", IIC_IntSimple, [(set i64:$RST, (add i64:$RA, imm64SExt16:$D))]>; @@ -782,11 +782,11 @@ def LA8 : DForm_2<14, (outs g8rc:$RST), (ins g8rc_nox0:$RA, s16imm64:$D), let Defs = [CARRY] in { def SUBFIC8: DForm_2< 8, (outs g8rc:$RST), (ins g8rc:$RA, s16imm64:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i64:$RST, (PPCsubc imm64SExt16:$D, i64:$RA))]>; + [(set i64:$RST, (subc imm64SExt16:$D, i64:$RA))]>; } defm SUBFC8 : XOForm_1rc<31, 8, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsubc i64:$RB, i64:$RA))]>, + [(set i64:$RT, (subc i64:$RB, i64:$RA))]>, PPC970_DGroup_Cracked; defm SUBF8 : XOForm_1rx<31, 40, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subf", "$RT, $RA, $RB", IIC_IntGeneral, @@ -798,22 +798,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE8 : XOForm_1rc<31, 138, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, i64:$RB, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, i64:$RB))]>; defm ADDME8 : XOForm_3rc<31, 234, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, -1, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, -1))]>; defm ADDZE8 : XOForm_3rc<31, 202, 0, (outs g8rc:$RT), (ins g8rc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCadde i64:$RA, 0, CARRY))]>; + [(set i64:$RT, (adde i64:$RA, 0))]>; defm SUBFE8 : XOForm_1rc<31, 136, 0, (outs g8rc:$RT), (ins g8rc:$RA, g8rc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i64:$RT, (PPCsube i64:$RB, i64:$RA, CARRY))]>; + [(set 
i64:$RT, (sube i64:$RB, i64:$RA))]>; defm SUBFME8 : XOForm_3rc<31, 232, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube -1, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube -1, i64:$RA))]>; defm SUBFZE8 : XOForm_3rc<31, 200, 0, (outs g8rc:$RT), (ins g8rc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i64:$RT, (PPCsube 0, i64:$RA, CARRY))]>; + [(set i64:$RT, (sube 0, i64:$RA))]>; } } // isCodeGenOnly diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 6e0640fa715ea..3aef6f2c893fa 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1758,23 +1758,6 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB, BuildMI(MBB, I, DL, get(PPC::EFDCFS), DestReg).addReg(SrcReg); getKillRegState(KillSrc); return; - } else if ((PPC::G8RCRegClass.contains(DestReg) || - PPC::GPRCRegClass.contains(DestReg)) && - SrcReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(DestReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MFSPR8 : PPC::MFSPR), DestReg) - .addImm(1) - .addReg(PPC::CARRY, RegState::Implicit); - return; - } else if ((PPC::G8RCRegClass.contains(SrcReg) || - PPC::GPRCRegClass.contains(SrcReg)) && - DestReg == PPC::CARRY) { - bool Is64Bit = PPC::G8RCRegClass.contains(SrcReg); - BuildMI(MBB, I, DL, get(Is64Bit ? PPC::MTSPR8 : PPC::MTSPR)) - .addImm(1) - .addReg(SrcReg) - .addReg(PPC::CARRY, RegState::ImplicitDefine); - return; } unsigned Opc; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td index b5ed5d55da4c7..be90a5c562c57 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td @@ -124,21 +124,6 @@ def SDT_PPCFPMinMax : SDTypeProfile<1, 2, [ SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0> ]>; -// RES, CARRY = op LHS, RHS -def SDT_PPCBinaryArithWithFlagsOut : SDTypeProfile<2, 2, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisVT<1, i32>, -]>; - -// RES, CARRY = op LHS, RHS, CARRY -def SDT_PPCBinaryArithWithFlagsInOut : SDTypeProfile<2, 3, [ - SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, - SDTCisInt<0>, - SDTCisSameAs<1, 4>, - SDTCisVT<1, i32>, -]>; - //===----------------------------------------------------------------------===// // PowerPC specific DAG Nodes. // @@ -416,15 +401,6 @@ def PPCtlsdynamatpcreladdr : SDNode<"PPCISD::TLS_DYNAMIC_MAT_PCREL_ADDR", def PPCtlslocalexecmataddr : SDNode<"PPCISD::TLS_LOCAL_EXEC_MAT_ADDR", SDTIntUnaryOp, []>; -def PPCaddc : SDNode<"PPCISD::ADDC", SDT_PPCBinaryArithWithFlagsOut, - [SDNPCommutative]>; -def PPCadde : SDNode<"PPCISD::ADDE", SDT_PPCBinaryArithWithFlagsInOut, - []>; -def PPCsubc : SDNode<"PPCISD::SUBC", SDT_PPCBinaryArithWithFlagsOut, - []>; -def PPCsube : SDNode<"PPCISD::SUBE", SDT_PPCBinaryArithWithFlagsInOut, - []>; - //===----------------------------------------------------------------------===// // PowerPC specific transformation functions and pattern fragments. 
// @@ -2315,7 +2291,7 @@ let BaseName = "addic" in { let Defs = [CARRY] in def ADDIC : DForm_2<12, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "addic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCaddc i32:$RA, imm32SExt16:$D))]>, + [(set i32:$RST, (addc i32:$RA, imm32SExt16:$D))]>, RecFormRel, PPC970_DGroup_Cracked; let Defs = [CARRY, CR0] in def ADDIC_rec : DForm_2<13, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), @@ -2336,7 +2312,7 @@ def MULLI : DForm_2< 7, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), let Defs = [CARRY] in def SUBFIC : DForm_2< 8, (outs gprc:$RST), (ins gprc:$RA, s16imm:$D), "subfic $RST, $RA, $D", IIC_IntGeneral, - [(set i32:$RST, (PPCsubc imm32SExt16:$D, i32:$RA))]>; + [(set i32:$RST, (subc imm32SExt16:$D, i32:$RA))]>; let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in { def LI : DForm_2_r0<14, (outs gprc:$RST), (ins s16imm:$D), @@ -2933,7 +2909,7 @@ def ADD4TLS : XOForm_1<31, 266, 0, (outs gprc:$RT), (ins gprc:$RA, tlsreg32:$RB let isCommutable = 1 in defm ADDC : XOForm_1rc<31, 10, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "addc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCaddc i32:$RA, i32:$RB))]>, + [(set i32:$RT, (addc i32:$RA, i32:$RB))]>, PPC970_DGroup_Cracked; defm DIVW : XOForm_1rcr<31, 491, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), @@ -2966,7 +2942,7 @@ defm SUBF : XOForm_1rx<31, 40, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), [(set i32:$RT, (sub i32:$RB, i32:$RA))]>; defm SUBFC : XOForm_1rc<31, 8, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfc", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsubc i32:$RB, i32:$RA))]>, + [(set i32:$RT, (subc i32:$RB, i32:$RA))]>, PPC970_DGroup_Cracked; defm NEG : XOForm_3r<31, 104, 0, (outs gprc:$RT), (ins gprc:$RA), "neg", "$RT, $RA", IIC_IntSimple, @@ -2975,22 +2951,22 @@ let Uses = [CARRY] in { let isCommutable = 1 in defm ADDE : XOForm_1rc<31, 138, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "adde", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, i32:$RB, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, i32:$RB))]>; defm ADDME : XOForm_3rc<31, 234, 0, (outs gprc:$RT), (ins gprc:$RA), "addme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, -1, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, -1))]>; defm ADDZE : XOForm_3rc<31, 202, 0, (outs gprc:$RT), (ins gprc:$RA), "addze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCadde i32:$RA, 0, CARRY))]>; + [(set i32:$RT, (adde i32:$RA, 0))]>; defm SUBFE : XOForm_1rc<31, 136, 0, (outs gprc:$RT), (ins gprc:$RA, gprc:$RB), "subfe", "$RT, $RA, $RB", IIC_IntGeneral, - [(set i32:$RT, (PPCsube i32:$RB, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube i32:$RB, i32:$RA))]>; defm SUBFME : XOForm_3rc<31, 232, 0, (outs gprc:$RT), (ins gprc:$RA), "subfme", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube -1, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube -1, i32:$RA))]>; defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$RT), (ins gprc:$RA), "subfze", "$RT, $RA", IIC_IntGeneral, - [(set i32:$RT, (PPCsube 0, i32:$RA, CARRY))]>; + [(set i32:$RT, (sube 0, i32:$RA))]>; } } diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2177dba1e5762..b60a91be82406 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -625,13 +625,6 @@ bool PPCRegisterInfo::getRegAllocationHints(Register VirtReg, return BaseImplRetVal; } -const TargetRegisterClass * -PPCRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) 
const { - if (RC == &PPC::CARRYRCRegClass) - return TM.isPPC64() ? &PPC::G8RCRegClass : &PPC::GPRCRegClass; - return RC; -} - unsigned PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const { const PPCFrameLowering *TFI = getFrameLowering(MF); diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h index 21b6f7b13939a..274c7cb68ae0a 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h @@ -76,9 +76,6 @@ class PPCRegisterInfo : public PPCGenRegisterInfo { const TargetRegisterClass * getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override; - const TargetRegisterClass * - getCrossCopyRegClass(const TargetRegisterClass *RC) const override; - unsigned getRegPressureLimit(const TargetRegisterClass *RC, MachineFunction &MF) const override; diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td index 8b690b7b833b3..3cb7cd9d8f229 100644 --- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td +++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td @@ -494,7 +494,6 @@ def LR8RC : RegisterClass<"PPC", [i64], 64, (add LR8)> { def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>; def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY, XER)> { let CopyCost = -1; - let isAllocatable = 0; } // Make AllocationOrder as similar as G8RC's to avoid potential spilling. diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 98c25bc93a8a2..0f5e7bd254f68 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1549,9 +1549,6 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, if (Subtarget.useRVVForFixedLengthVectors()) setTargetDAGCombine(ISD::BITCAST); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Disable strict node mutation. IsStrictFPEnabled = true; EnableExtLdPromotion = true; @@ -19462,6 +19459,11 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known = Known.intersectWith(Known2); break; } + case RISCVISD::VCPOP_VL: { + KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1); + Known.Zero.setBitsFrom(Known2.countMaxActiveBits()); + break; + } case RISCVISD::CZERO_EQZ: case RISCVISD::CZERO_NEZ: Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1); diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp index 4fc79b3d6e3f8..da4ef677440fd 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp @@ -377,11 +377,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering( setMaxAtomicSizeInBitsSupported(64); - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - // Define the emscripten name for return address helper. // TODO: when implementing other Wasm backends, make this generic or only do // this on emscripten depending on what they end up doing. 
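[editor's note] The RISCVISD::VCPOP_VL known-bits case added above rests on a simple counting argument: vcpop.m can set at most VL result bits' worth of count, so if the VL operand is known to fit in K bits, the popcount result also fits in K bits and everything from bit K upward is known zero — which is what `Known.Zero.setBitsFrom(Known2.countMaxActiveBits())` records. The sketch below is a plain-C++ sanity check of that reasoning only; `bitsNeeded`, `MaxVL`, and the random trials are illustrative stand-ins, not LLVM APIs.

```cpp
// Plain-C++ sanity check (illustrative, not LLVM code) for the VCPOP_VL bound.
#include <cassert>
#include <cstdint>
#include <random>

// Rough stand-in for KnownBits::countMaxActiveBits(): number of low bits
// needed to represent the largest value the operand can take.
static unsigned bitsNeeded(uint64_t MaxVal) {
  unsigned Bits = 0;
  for (; MaxVal; MaxVal >>= 1)
    ++Bits;
  return Bits;
}

int main() {
  std::mt19937_64 Rng(42);
  const uint64_t MaxVL = 64;            // assumed upper bound on the VL operand
  const unsigned K = bitsNeeded(MaxVL); // 7 when MaxVL == 64
  for (int Trial = 0; Trial != 10000; ++Trial) {
    uint64_t VL = Rng() % (MaxVL + 1);  // any VL consistent with that bound
    uint64_t Mask = Rng();              // arbitrary mask register contents
    uint64_t PopCount = 0;
    for (uint64_t I = 0; I != VL; ++I)  // vcpop.m counts set bits among the
      PopCount += (Mask >> I) & 1;      // first VL mask elements only
    assert((PopCount >> K) == 0 && "bits from position K upward are zero");
  }
  return 0;
}
```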
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index b20a06b238c88..1fe0b1f2e0591 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -537,10 +537,6 @@ struct StaticLibcallNameMap { Map[NameLibcall.first] = NameLibcall.second; } } - // Override the __gnu_f2h_ieee/__gnu_h2f_ieee names so that the f32 name is - // consistent with the f64 and f128 names. - Map["__extendhfsf2"] = RTLIB::FPEXT_F16_F32; - Map["__truncsfhf2"] = RTLIB::FPROUND_F32_F16; Map["emscripten_return_address"] = RTLIB::RETURN_ADDRESS; } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 696bb14292dd0..d805a76754c71 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -736,9 +736,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f32, Custom); setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Custom); - setLibcallName(RTLIB::FPROUND_F32_F16, "__truncsfhf2"); - setLibcallName(RTLIB::FPEXT_F16_F32, "__extendhfsf2"); - // Lower this to MOVMSK plus an AND. setOperationAction(ISD::FGETSIGN, MVT::i64, Custom); setOperationAction(ISD::FGETSIGN, MVT::i32, Custom); @@ -6113,6 +6110,19 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts, Ops.push_back(N1); return true; } + case ISD::CONCAT_VECTORS: { + // Limit this to vXi64 vector cases to make the most of cross lane shuffles. + unsigned NumSubElts = N.getOperand(0).getValueType().getVectorNumElements(); + if (NumBitsPerElt == 64) { + for (unsigned I = 0, E = N.getNumOperands(); I != E; ++I) { + for (unsigned M = 0; M != NumSubElts; ++M) + Mask.push_back((I * NumElts) + M); + Ops.push_back(N.getOperand(I)); + } + return true; + } + return false; + } case ISD::INSERT_SUBVECTOR: { SDValue Src = N.getOperand(0); SDValue Sub = N.getOperand(1); @@ -38927,13 +38937,17 @@ static bool matchUnaryShuffle(MVT MaskVT, ArrayRef Mask, } // Match against a ANY/SIGN/ZERO_EXTEND_VECTOR_INREG instruction. - // TODO: Add 512-bit vector support (split AVX512F and AVX512BW). - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || - (MaskVT.is256BitVector() && Subtarget.hasInt256()))) { + if (AllowIntDomain && + ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) || + (MaskVT.is256BitVector() && Subtarget.hasInt256()) || + (MaskVT.is512BitVector() && Subtarget.useAVX512Regs()))) { unsigned MaxScale = 64 / MaskEltSize; bool UseSign = V1.getScalarValueSizeInBits() == MaskEltSize && DAG.ComputeNumSignBits(V1) == MaskEltSize; for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) { + // Skip 512-bit VPMOV?XBW on non-AVX512BW targets. + if (Scale == 2 && MaskVT == MVT::v64i8 && !Subtarget.useBWIRegs()) + continue; bool MatchAny = true; bool MatchZero = true; bool MatchSign = UseSign; @@ -39566,7 +39580,7 @@ static bool matchBinaryPermuteShuffle( static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget); @@ -39581,7 +39595,7 @@ static SDValue combineX86ShuffleChainWithExtract( /// instruction but should only be used to replace chains over a certain depth. 
static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, @@ -40050,6 +40064,10 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, if (Depth < 1) return SDValue(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); + // Depth threshold above which we can efficiently use variable mask shuffles. int VariableCrossLaneShuffleDepth = Subtarget.hasFastVariableCrossLaneShuffle() ? 1 : 2; @@ -40120,9 +40138,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, - Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input lane-crossing shuffle then lower to VPERMV3, @@ -40293,8 +40310,8 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // If that failed and either input is extracted then try to combine as a // shuffle with the larger type. if (SDValue WideShuffle = combineX86ShuffleChainWithExtract( - Inputs, Root, BaseMask, Depth, HasVariableMask, - AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) + Inputs, Root, BaseMask, Depth, SrcNodes, AllowVariableCrossLaneMask, + AllowVariablePerLaneMask, DAG, Subtarget)) return WideShuffle; // If we have a dual input shuffle then lower to VPERMV3, @@ -40332,7 +40349,7 @@ static SDValue combineX86ShuffleChain(ArrayRef Inputs, SDValue Root, // extract_subvector(shuffle(x,y,m2),0) static SDValue combineX86ShuffleChainWithExtract( ArrayRef Inputs, SDValue Root, ArrayRef BaseMask, int Depth, - bool HasVariableMask, bool AllowVariableCrossLaneMask, + ArrayRef SrcNodes, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { unsigned NumMaskElts = BaseMask.size(); @@ -40461,7 +40478,7 @@ static SDValue combineX86ShuffleChainWithExtract( if (SDValue WideShuffle = combineX86ShuffleChain(WideInputs, WideRoot, WideMask, Depth, - HasVariableMask, AllowVariableCrossLaneMask, + SrcNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) { WideShuffle = extractSubVector(WideShuffle, 0, DAG, SDLoc(Root), RootSizeInBits); @@ -40684,7 +40701,7 @@ static SDValue canonicalizeShuffleMaskWithHorizOp( // TODO: Extend this to merge multiple constant Ops and update the mask. static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, ArrayRef Mask, - bool HasVariableMask, + ArrayRef SrcNodes, SelectionDAG &DAG, const SDLoc &DL, const X86Subtarget &Subtarget) { unsigned SizeInBits = VT.getSizeInBits(); @@ -40706,6 +40723,9 @@ static SDValue combineX86ShufflesConstants(MVT VT, ArrayRef Ops, // only used once or the combined shuffle has included a variable mask // shuffle, this is to avoid constant pool bloat. 
bool IsOptimizingSize = DAG.shouldOptForSize(); + bool HasVariableMask = llvm::any_of(SrcNodes, [](const SDNode *N) { + return isTargetShuffleVariableMask(N->getOpcode()); + }); if (IsOptimizingSize && !HasVariableMask && llvm::none_of(Ops, [](SDValue SrcOp) { return SrcOp->hasOneUse(); })) return SDValue(); @@ -40807,7 +40827,7 @@ namespace llvm { static SDValue combineX86ShufflesRecursively( ArrayRef SrcOps, int SrcOpIndex, SDValue Root, ArrayRef RootMask, ArrayRef SrcNodes, unsigned Depth, - unsigned MaxDepth, bool HasVariableMask, bool AllowVariableCrossLaneMask, + unsigned MaxDepth, bool AllowVariableCrossLaneMask, bool AllowVariablePerLaneMask, SelectionDAG &DAG, const X86Subtarget &Subtarget) { assert(!RootMask.empty() && @@ -40863,7 +40883,6 @@ static SDValue combineX86ShufflesRecursively( SmallVector OpMask; SmallVector OpInputs; APInt OpUndef, OpZero; - bool IsOpVariableMask = isTargetShuffleVariableMask(Op.getOpcode()); if (getTargetShuffleInputs(Op, OpDemandedElts, OpInputs, OpMask, OpUndef, OpZero, DAG, Depth, false)) { // Shuffle inputs must not be larger than the shuffle result. @@ -41078,7 +41097,6 @@ static SDValue combineX86ShufflesRecursively( return getOnesVector(RootVT, DAG, DL); assert(!Ops.empty() && "Shuffle with no inputs detected"); - HasVariableMask |= IsOpVariableMask; // Update the list of shuffle nodes that have been combined so far. SmallVector CombinedNodes(SrcNodes); @@ -41107,15 +41125,14 @@ static SDValue combineX86ShufflesRecursively( } if (SDValue Res = combineX86ShufflesRecursively( Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1, MaxDepth, - HasVariableMask, AllowCrossLaneVar, AllowPerLaneVar, DAG, - Subtarget)) + AllowCrossLaneVar, AllowPerLaneVar, DAG, Subtarget)) return Res; } } // Attempt to constant fold all of the constant source ops. if (SDValue Cst = combineX86ShufflesConstants( - RootVT, Ops, Mask, HasVariableMask, DAG, DL, Subtarget)) + RootVT, Ops, Mask, CombinedNodes, DAG, DL, Subtarget)) return Cst; // If constant fold failed and we only have constants - then we have @@ -41217,7 +41234,7 @@ static SDValue combineX86ShufflesRecursively( // Try to combine into a single shuffle instruction. if (SDValue Shuffle = combineX86ShuffleChain( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget)) return Shuffle; @@ -41236,7 +41253,7 @@ static SDValue combineX86ShufflesRecursively( // If that failed and any input is extracted then try to combine as a // shuffle with the larger type. 
return combineX86ShuffleChainWithExtract( - Ops, Root, Mask, Depth, HasVariableMask, AllowVariableCrossLaneMask, + Ops, Root, Mask, Depth, CombinedNodes, AllowVariableCrossLaneMask, AllowVariablePerLaneMask, DAG, Subtarget); } @@ -41245,7 +41262,6 @@ static SDValue combineX86ShufflesRecursively(SDValue Op, SelectionDAG &DAG, const X86Subtarget &Subtarget) { return combineX86ShufflesRecursively( {Op}, 0, Op, {0}, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget); } @@ -41883,7 +41899,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, if (SDValue Res = combineX86ShufflesRecursively( {BC}, 0, BC, DemandedMask, {}, /*Depth*/ 0, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, + /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, DAG, Subtarget)) return DAG.getNode(X86ISD::VBROADCAST, DL, VT, DAG.getBitcast(SrcVT, Res)); @@ -42222,7 +42238,7 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL, llvm::narrowShuffleMaskElts(EltBits / 8, Mask, ByteMask); if (SDValue NewMask = combineX86ShufflesConstants( ShufVT, {MaskLHS, MaskRHS}, ByteMask, - /*HasVariableMask=*/true, DAG, DL, Subtarget)) { + {LHS.getNode(), RHS.getNode()}, DAG, DL, Subtarget)) { SDValue NewLHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, LHS.getOperand(0), NewMask); SDValue NewRHS = DAG.getNode(X86ISD::PSHUFB, DL, ShufVT, @@ -43857,7 +43873,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( SDValue NewShuffle = combineX86ShufflesRecursively( {Op}, 0, Op, DemandedMask, {}, 0, X86::MaxShuffleCombineDepth - Depth, - /*HasVarMask*/ false, /*AllowCrossLaneVarMask*/ true, /*AllowPerLaneVarMask*/ true, TLO.DAG, Subtarget); if (NewShuffle) @@ -51416,7 +51431,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue Shuffle = combineX86ShufflesRecursively( {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 1, X86::MaxShuffleCombineDepth, - /*HasVarMask*/ false, /*AllowVarCrossLaneMask*/ true, + /*AllowVarCrossLaneMask*/ true, /*AllowVarPerLaneMask*/ true, DAG, Subtarget)) return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Shuffle, N0.getOperand(1)); diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp index c43fd97a055fc..2859195c6c26e 100644 --- a/llvm/lib/Target/X86/X86PadShortFunction.cpp +++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp @@ -163,7 +163,8 @@ void PadShortFunc::findReturns(MachineBasicBlock *MBB, unsigned int Cycles) { return; if (hasReturn) { - ReturnBBs[MBB] = std::max(ReturnBBs[MBB], Cycles); + unsigned int &NumCycles = ReturnBBs[MBB]; + NumCycles = std::max(NumCycles, Cycles); return; } diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 0a605dfd017cb..8731a16b88a5c 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -104,8 +104,6 @@ constexpr GPUInfo AMDGCNGPUs[] = { {{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, {{"gfx90a"}, {"gfx90a"}, GK_GFX90A, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx90c"}, {"gfx90c"}, GK_GFX90C, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK}, - {{"gfx940"}, {"gfx940"}, GK_GFX940, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, - {{"gfx941"}, {"gfx941"}, GK_GFX941, 
FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx942"}, {"gfx942"}, GK_GFX942, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx950"}, {"gfx950"}, GK_GFX950, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_XNACK|FEATURE_SRAMECC}, {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32|FEATURE_XNACK|FEATURE_WGP}, @@ -260,8 +258,6 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) { case GK_GFX909: return {9, 0, 9}; case GK_GFX90A: return {9, 0, 10}; case GK_GFX90C: return {9, 0, 12}; - case GK_GFX940: return {9, 4, 0}; - case GK_GFX941: return {9, 4, 1}; case GK_GFX942: return {9, 4, 2}; case GK_GFX950: return {9, 5, 0}; case GK_GFX1010: return {10, 1, 0}; @@ -506,8 +502,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx950-insts"] = true; [[fallthrough]]; case GK_GFX942: - case GK_GFX941: - case GK_GFX940: Features["fp8-insts"] = true; Features["fp8-conversion-insts"] = true; if (Kind != GK_GFX950) diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp index c1dd8bc393f33..17e7fada10827 100644 --- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp +++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp @@ -3970,17 +3970,18 @@ struct AANoAliasCallSiteArgument final : AANoAliasImpl { // TODO: We should track the capturing uses in AANoCapture but the problem // is CGSCC runs. For those we would need to "allow" AANoCapture for // a value in the module slice. - // TODO(captures): Make this more precise. - UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI + << "\n"); + return false; + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *UserI << "\n"); - return false; + llvm_unreachable("unknown UseCaptureKind"); }; bool IsKnownNoCapture; @@ -6018,16 +6019,16 @@ ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) { }; auto UseCheck = [&](const Use &U, bool &Follow) -> bool { - // TODO(captures): Make this more precise. - UseCaptureInfo CI = - DetermineUseCaptureKind(U, /*Base=*/nullptr, IsDereferenceableOrNull); - if (capturesNothing(CI)) + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::NO_CAPTURE: return true; - if (CI.isPassthrough()) { + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, T, U, Follow); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, T, U, Follow); + llvm_unreachable("Unexpected use capture kind!"); }; if (!A.checkForAllUses(UseCheck, *this, *V)) @@ -12150,13 +12151,16 @@ struct AAGlobalValueInfoFloating : public AAGlobalValueInfo { auto UsePred = [&](const Use &U, bool &Follow) -> bool { Uses.insert(&U); - // TODO(captures): Make this more precise. 
- UseCaptureInfo CI = DetermineUseCaptureKind(U, /*Base=*/nullptr, nullptr); - if (CI.isPassthrough()) { + switch (DetermineUseCaptureKind(U, nullptr)) { + case UseCaptureKind::NO_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::MAY_CAPTURE: + return checkUse(A, U, Follow, Worklist); + case UseCaptureKind::PASSTHROUGH: Follow = true; return true; } - return checkUse(A, U, Follow, Worklist); + return true; }; auto EquivalentUseCB = [&](const Use &OldU, const Use &NewU) { Uses.insert(&OldU); diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp index 02b0fcb3981a7..a63e38a7d98ad 100644 --- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp @@ -71,9 +71,7 @@ using namespace llvm; #define DEBUG_TYPE "function-attrs" STATISTIC(NumMemoryAttr, "Number of functions with improved memory attribute"); -STATISTIC(NumCapturesNone, "Number of arguments marked captures(none)"); -STATISTIC(NumCapturesPartial, "Number of arguments marked with captures " - "attribute other than captures(none)"); +STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumReturned, "Number of arguments marked returned"); STATISTIC(NumReadNoneArg, "Number of arguments marked readnone"); STATISTIC(NumReadOnlyArg, "Number of arguments marked readonly"); @@ -110,13 +108,6 @@ static cl::opt DisableThinLTOPropagation( "disable-thinlto-funcattrs", cl::init(true), cl::Hidden, cl::desc("Don't propagate function-attrs in thinLTO")); -static void addCapturesStat(CaptureInfo CI) { - if (capturesNothing(CI)) - ++NumCapturesNone; - else - ++NumCapturesPartial; -} - namespace { using SCCNodeSet = SmallSetVector; @@ -507,9 +498,6 @@ namespace { /// SCC of the arguments. struct ArgumentGraphNode { Argument *Definition; - /// CaptureComponents for this argument, excluding captures via Uses. - /// We don't distinguish between other/return captures here. - CaptureComponents CC = CaptureComponents::None; SmallVector Uses; }; @@ -551,36 +539,18 @@ class ArgumentGraph { struct ArgumentUsesTracker : public CaptureTracker { ArgumentUsesTracker(const SCCNodeSet &SCCNodes) : SCCNodes(SCCNodes) {} - void tooManyUses() override { CI = CaptureInfo::all(); } - - Action captured(const Use *U, UseCaptureInfo UseCI) override { - if (updateCaptureInfo(U, UseCI.UseCC)) { - // Don't bother continuing if we already capture everything. - if (capturesAll(CI.getOtherComponents())) - return Stop; - return Continue; - } - - // For SCC argument tracking, we're not going to analyze other/ret - // components separately, so don't follow the return value. - return ContinueIgnoringReturn; - } + void tooManyUses() override { Captured = true; } - bool updateCaptureInfo(const Use *U, CaptureComponents CC) { + bool captured(const Use *U) override { CallBase *CB = dyn_cast(U->getUser()); if (!CB) { - if (isa(U->getUser())) - CI |= CaptureInfo::retOnly(CC); - else - // Conservatively assume that the captured value might make its way - // into the return value as well. This could be made more precise. - CI |= CaptureInfo(CC); + Captured = true; return true; } Function *F = CB->getCalledFunction(); if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) { - CI |= CaptureInfo(CC); + Captured = true; return true; } @@ -594,24 +564,22 @@ struct ArgumentUsesTracker : public CaptureTracker { // use. In this case it does not matter if the callee is within our SCC // or not -- we've been captured in some unknown way, and we have to be // conservative. 
- CI |= CaptureInfo(CC); + Captured = true; return true; } if (UseIndex >= F->arg_size()) { assert(F->isVarArg() && "More params than args in non-varargs call"); - CI |= CaptureInfo(CC); + Captured = true; return true; } - // TODO(captures): Could improve precision by remembering maximum - // capture components for the argument. Uses.push_back(&*std::next(F->arg_begin(), UseIndex)); return false; } - // Does not include potential captures via Uses in the SCC. - CaptureInfo CI = CaptureInfo::none(); + // True only if certainly captured (used outside our SCC). + bool Captured = false; // Uses within our SCC. SmallVector Uses; @@ -1226,15 +1194,6 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, bool SkipInitializes) { ArgumentGraph AG; - auto DetermineAccessAttrsForSingleton = [](Argument *A) { - SmallPtrSet Self; - Self.insert(A); - Attribute::AttrKind R = determinePointerAccessAttrs(A, Self); - if (R != Attribute::None) - return addAccessAttr(A, R); - return false; - }; - // Check each function in turn, determining which pointer arguments are not // captured. for (Function *F : SCCNodes) { @@ -1255,7 +1214,7 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (A.getType()->isPointerTy() && !A.hasNoCaptureAttr()) { A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), CaptureInfo::none())); - ++NumCapturesNone; + ++NumNoCapture; Changed.insert(F); } } @@ -1266,23 +1225,21 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (!A.getType()->isPointerTy()) continue; bool HasNonLocalUses = false; - CaptureInfo OrigCI = A.getAttributes().getCaptureInfo(); - if (!capturesNothing(OrigCI)) { + if (!A.hasNoCaptureAttr()) { ArgumentUsesTracker Tracker(SCCNodes); PointerMayBeCaptured(&A, &Tracker); - CaptureInfo NewCI = Tracker.CI & OrigCI; - if (NewCI != OrigCI) { + if (!Tracker.Captured) { if (Tracker.Uses.empty()) { - // If the information is complete, add the attribute now. - A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), NewCI)); - addCapturesStat(NewCI); + // If it's trivially not captured, mark it nocapture now. + A.addAttr(Attribute::getWithCaptureInfo(A.getContext(), + CaptureInfo::none())); + ++NumNoCapture; Changed.insert(F); } else { // If it's not trivially captured and not trivially not captured, // then it must be calling into another function in our SCC. Save // its particulars for Argument-SCC analysis later. ArgumentGraphNode *Node = AG[&A]; - Node->CC = CaptureComponents(NewCI); for (Argument *Use : Tracker.Uses) { Node->Uses.push_back(AG[Use]); if (Use != &A) @@ -1297,8 +1254,12 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, // an SCC? Note that we don't allow any calls at all here, or else our // result will be dependent on the iteration order through the // functions in the SCC. 
- if (DetermineAccessAttrsForSingleton(&A)) - Changed.insert(F); + SmallPtrSet Self; + Self.insert(&A); + Attribute::AttrKind R = determinePointerAccessAttrs(&A, Self); + if (R != Attribute::None) + if (addAccessAttr(&A, R)) + Changed.insert(F); } if (!SkipInitializes && !A.onlyReadsMemory()) { if (inferInitializes(A, *F)) @@ -1324,17 +1285,17 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, if (ArgumentSCC[0]->Uses.size() == 1 && ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) { Argument *A = ArgumentSCC[0]->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(ArgumentSCC[0]->CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - - // Infer the access attributes given the new captures one - if (DetermineAccessAttrsForSingleton(A)) - Changed.insert(A->getParent()); + A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), + CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); + + // Infer the access attributes given the new nocapture one + SmallPtrSet Self; + Self.insert(&*A); + Attribute::AttrKind R = determinePointerAccessAttrs(&*A, Self); + if (R != Attribute::None) + addAccessAttr(A, R); } continue; } @@ -1346,45 +1307,27 @@ static void addArgumentAttrs(const SCCNodeSet &SCCNodes, ArgumentSCCNodes.insert(I->Definition); } - // At the SCC level, only track merged CaptureComponents. We're not - // currently prepared to handle propagation of return-only captures across - // the SCC. - CaptureComponents CC = CaptureComponents::None; + bool SCCCaptured = false; for (ArgumentGraphNode *N : ArgumentSCC) { for (ArgumentGraphNode *Use : N->Uses) { Argument *A = Use->Definition; - if (ArgumentSCCNodes.count(A)) - CC |= Use->CC; - else - CC |= CaptureComponents(A->getAttributes().getCaptureInfo()); + if (A->hasNoCaptureAttr() || ArgumentSCCNodes.count(A)) + continue; + SCCCaptured = true; break; } - if (capturesAll(CC)) + if (SCCCaptured) break; } - - if (!capturesAll(CC)) { - for (ArgumentGraphNode *N : ArgumentSCC) { - Argument *A = N->Definition; - CaptureInfo OrigCI = A->getAttributes().getCaptureInfo(); - CaptureInfo NewCI = CaptureInfo(N->CC | CC) & OrigCI; - if (NewCI != OrigCI) { - A->addAttr(Attribute::getWithCaptureInfo(A->getContext(), NewCI)); - addCapturesStat(NewCI); - Changed.insert(A->getParent()); - } - } - } - - // TODO(captures): Ignore address-only captures. - if (capturesAnything(CC)) { - // As the pointer may be captured, determine the pointer attributes - // looking at each argument invidivually. - for (ArgumentGraphNode *N : ArgumentSCC) { - if (DetermineAccessAttrsForSingleton(N->Definition)) - Changed.insert(N->Definition->getParent()); - } + if (SCCCaptured) continue; + + for (ArgumentGraphNode *N : ArgumentSCC) { + Argument *A = N->Definition; + A->addAttr( + Attribute::getWithCaptureInfo(A->getContext(), CaptureInfo::none())); + ++NumNoCapture; + Changed.insert(A->getParent()); } // We also want to compute readonly/readnone/writeonly. 
With a small number diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 76020d2b1dbf4..00a8117f32e70 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -882,8 +882,7 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { void tooManyUses() override { Captured = true; } - Action captured(const Use *U, UseCaptureInfo CI) override { - // TODO(captures): Use UseCaptureInfo. + bool captured(const Use *U) override { auto *ICmp = dyn_cast(U->getUser()); // We need to check that U is based *only* on the alloca, and doesn't // have other contributions from a select/phi operand. @@ -893,11 +892,11 @@ bool InstCombinerImpl::foldAllocaCmp(AllocaInst *Alloca) { // Collect equality icmps of the alloca, and don't treat them as // captures. ICmps[ICmp] |= 1u << U->getOperandNo(); - return Continue; + return false; } Captured = true; - return Stop; + return true; } }; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index cf38fc5f058f2..e621a0b7fe596 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -205,11 +205,15 @@ static Value *foldSelectICmpAnd(SelectInst &Sel, ICmpInst *Cmp, unsigned ValZeros = ValC.logBase2(); unsigned AndZeros = AndMask.logBase2(); bool ShouldNotVal = !TC.isZero(); + bool NeedShift = ValZeros != AndZeros; + bool NeedZExtTrunc = + SelType->getScalarSizeInBits() != V->getType()->getScalarSizeInBits(); - // If we would need to create an 'and' + 'shift' + 'xor' to replace a 'select' - // + 'icmp', then this transformation would result in more instructions and - // potentially interfere with other folding. - if (CreateAnd && ShouldNotVal && ValZeros != AndZeros) + // If we would need to create an 'and' + 'shift' + 'xor' + cast to replace + // a 'select' + 'icmp', then this transformation would result in more + // instructions and potentially interfere with other folding. + if (CreateAnd + ShouldNotVal + NeedShift + NeedZExtTrunc > + 1 + Cmp->hasOneUse()) return nullptr; // Insert the 'and' instruction on the input to the truncate. @@ -742,39 +746,47 @@ static Value *foldSelectICmpLshrAshr(const ICmpInst *IC, Value *TrueVal, /// 1. The icmp predicate is inverted /// 2. The select operands are reversed /// 3. The magnitude of C2 and C1 are flipped -static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, - Value *FalseVal, - InstCombiner::BuilderTy &Builder) { +static Value *foldSelectICmpAndBinOp(Value *CondVal, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy &Builder) { // Only handle integer compares. Also, if this is a vector select, we need a // vector compare. 
if (!TrueVal->getType()->isIntOrIntVectorTy() || - TrueVal->getType()->isVectorTy() != IC->getType()->isVectorTy()) + TrueVal->getType()->isVectorTy() != CondVal->getType()->isVectorTy()) return nullptr; - Value *CmpLHS = IC->getOperand(0); - Value *CmpRHS = IC->getOperand(1); - unsigned C1Log; bool NeedAnd = false; - CmpInst::Predicate Pred = IC->getPredicate(); - if (IC->isEquality()) { - if (!match(CmpRHS, m_Zero())) - return nullptr; + CmpPredicate Pred; + Value *CmpLHS, *CmpRHS; - const APInt *C1; - if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) - return nullptr; + if (match(CondVal, m_ICmp(Pred, m_Value(CmpLHS), m_Value(CmpRHS)))) { + if (ICmpInst::isEquality(Pred)) { + if (!match(CmpRHS, m_Zero())) + return nullptr; - C1Log = C1->logBase2(); - } else { - auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred); - if (!Res || !Res->Mask.isPowerOf2()) - return nullptr; + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(), m_Power2(C1)))) + return nullptr; - CmpLHS = Res->X; - Pred = Res->Pred; - C1Log = Res->Mask.logBase2(); - NeedAnd = true; + C1Log = C1->logBase2(); + } else { + auto Res = decomposeBitTestICmp(CmpLHS, CmpRHS, Pred); + if (!Res || !Res->Mask.isPowerOf2()) + return nullptr; + + CmpLHS = Res->X; + Pred = Res->Pred; + C1Log = Res->Mask.logBase2(); + NeedAnd = true; + } + } else if (auto *Trunc = dyn_cast(CondVal)) { + CmpLHS = Trunc->getOperand(0); + C1Log = 0; + Pred = ICmpInst::ICMP_NE; + NeedAnd = !Trunc->hasNoUnsignedWrap(); + } else { + return nullptr; } Value *Y, *V = CmpLHS; @@ -808,7 +820,7 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, // Make sure we don't create more instructions than we save. if ((NeedShift + NeedXor + NeedZExtTrunc + NeedAnd) > - (IC->hasOneUse() + BinOp->hasOneUse())) + (CondVal->hasOneUse() + BinOp->hasOneUse())) return nullptr; if (NeedAnd) { @@ -1986,9 +1998,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Instruction *V = foldSelectZeroOrOnes(ICI, TrueVal, FalseVal, Builder)) return V; - if (Value *V = foldSelectICmpAndBinOp(ICI, TrueVal, FalseVal, Builder)) - return replaceInstUsesWith(SI, V); - if (Value *V = foldSelectICmpLshrAshr(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); @@ -3946,6 +3955,9 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) { if (Instruction *Result = foldSelectInstWithICmp(SI, ICI)) return Result; + if (Value *V = foldSelectICmpAndBinOp(CondVal, TrueVal, FalseVal, Builder)) + return replaceInstUsesWith(SI, V); + if (Instruction *Add = foldAddSubSelect(SI, Builder)) return Add; if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder)) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 9a729b7afb8b9..87b27beb01a0a 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1550,33 +1550,32 @@ bool MemCpyOptPass::performStackMoveOptzn(Instruction *Load, Instruction *Store, } if (!Visited.insert(&U).second) continue; - UseCaptureInfo CI = - DetermineUseCaptureKind(U, AI, IsDereferenceableOrNull); - // TODO(captures): Make this more precise. - if (CI.isPassthrough()) { + switch (DetermineUseCaptureKind(U, IsDereferenceableOrNull)) { + case UseCaptureKind::MAY_CAPTURE: + return false; + case UseCaptureKind::PASSTHROUGH: + // Instructions cannot have non-instruction users. 
Worklist.push_back(UI); continue; - } - - if (capturesAnything(CI)) - return false; - - if (UI->isLifetimeStartOrEnd()) { - // We note the locations of these intrinsic calls so that we can - // delete them later if the optimization succeeds, this is safe - // since both llvm.lifetime.start and llvm.lifetime.end intrinsics - // practically fill all the bytes of the alloca with an undefined - // value, although conceptually marked as alive/dead. - int64_t Size = cast(UI->getOperand(0))->getSExtValue(); - if (Size < 0 || Size == DestSize) { - LifetimeMarkers.push_back(UI); - continue; + case UseCaptureKind::NO_CAPTURE: { + if (UI->isLifetimeStartOrEnd()) { + // We note the locations of these intrinsic calls so that we can + // delete them later if the optimization succeeds, this is safe + // since both llvm.lifetime.start and llvm.lifetime.end intrinsics + // practically fill all the bytes of the alloca with an undefined + // value, although conceptually marked as alive/dead. + int64_t Size = cast(UI->getOperand(0))->getSExtValue(); + if (Size < 0 || Size == DestSize) { + LifetimeMarkers.push_back(UI); + continue; + } } + if (UI->hasMetadata(LLVMContext::MD_noalias)) + NoAliasInstrs.insert(UI); + if (!ModRefCallback(UI)) + return false; + } } - if (UI->hasMetadata(LLVMContext::MD_noalias)) - NoAliasInstrs.insert(UI); - if (!ModRefCallback(UI)) - return false; } } return true; diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 22c2f91ff55f6..cf0ba6fa54700 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -366,24 +366,6 @@ void PlainCFGBuilder::buildPlainCFG( // latter. BB2VPBB[ThePreheaderBB] = VectorPreheaderVPBB; Loop2Region[LI->getLoopFor(TheLoop->getHeader())] = TheRegion; - BasicBlock *ExitBB = TheLoop->getUniqueExitBlock(); - if (!ExitBB) { - // If there is no unique exit block, we must exit via the latch. This exit - // is mapped to the middle block in the input plan. - BasicBlock *Latch = TheLoop->getLoopLatch(); - auto *Br = cast(Latch->getTerminator()); - if (TheLoop->contains(Br->getSuccessor(0))) { - assert(!TheLoop->contains(Br->getSuccessor(1)) && - "latch must exit the loop"); - ExitBB = Br->getSuccessor(1); - } else { - assert(!TheLoop->contains(Br->getSuccessor(0)) && - "latch must exit the loop"); - ExitBB = Br->getSuccessor(0); - } - } - assert(ExitBB && "Must have a unique exit block or also exit via the latch."); - BB2VPBB[ExitBB] = cast(TheRegion->getSingleSuccessor()); // The existing vector region's entry and exiting VPBBs correspond to the loop // header and latch. 
diff --git a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll index cef11b94f3873..872715f31011d 100644 --- a/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll +++ b/llvm/test/Analysis/BasicAA/escape-source-aggregate.ll @@ -2,8 +2,9 @@ declare { ptr, i1 } @get_struct() declare <2 x ptr> @get_vec() +declare void @escape(ptr) -; CHECK: MayAlias: i32* %a, i32* %extract +; CHECK: NoAlias: i32* %a, i32* %extract define i32 @test_extractvalue() { %a = alloca i32 %call = call { ptr, i1 } @get_struct() @@ -13,7 +14,7 @@ define i32 @test_extractvalue() { ret i32 %v } -; CHECK: MayAlias: i32* %a, i32* %extract +; CHECK: NoAlias: i32* %a, i32* %extract define i32 @test_extractelement() { %a = alloca i32 %call = call <2 x ptr> @get_vec() @@ -22,3 +23,25 @@ define i32 @test_extractelement() { %v = load i32, ptr %a ret i32 %v } + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractvalue_escape() { + %a = alloca i32 + call void @escape(ptr %a) + %call = call { ptr, i1 } @get_struct() + %extract = extractvalue { ptr, i1 } %call, 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} + +; CHECK: MayAlias: i32* %a, i32* %extract +define i32 @test_extractelement_escape() { + %a = alloca i32 + call void @escape(ptr %a) + %call = call <2 x ptr> @get_vec() + %extract = extractelement <2 x ptr> %call, i32 0 + store i32 0, ptr %extract + %v = load i32, ptr %a + ret i32 %v +} diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll index 5e9dc7f2b91cc..38b7389ae9083 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll @@ -83,10 +83,52 @@ exit: define void @dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) { ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs' ; CHECK-NEXT: loop: -; CHECK-NEXT: Report: cannot check memory dependencies at runtime +; CHECK-NEXT: Memory dependences are safe with run-time checks ; CHECK-NEXT: Dependences: ; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Check 0: +; CHECK-NEXT: Comparing group ([[GRP5:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP6:0x[0-9a-f]+]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 1: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP7:0x[0-9a-f]+]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 2: +; CHECK-NEXT: Comparing group ([[GRP5]]): +; CHECK-NEXT: %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv +; CHECK-NEXT: Against group ([[GRP8:0x[0-9a-f]+]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 3: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Check 4: +; CHECK-NEXT: Comparing group ([[GRP6]]): +; CHECK-NEXT: %select = select i1 %cmp, 
ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset +; CHECK-NEXT: Check 5: +; CHECK-NEXT: Comparing group ([[GRP7]]): +; CHECK-NEXT: %select = select i1 %cmp, ptr %gep.b, ptr %gep.c +; CHECK-NEXT: Against group ([[GRP8]]): +; CHECK-NEXT: %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset ; CHECK-NEXT: Grouped accesses: +; CHECK-NEXT: Group [[GRP5]]: +; CHECK-NEXT: (Low: %a High: ((4 * %n) + %a)) +; CHECK-NEXT: Member: {%a,+,4}<%loop> +; CHECK-NEXT: Group [[GRP6]]: +; CHECK-NEXT: (Low: %b High: ((4 * %n) + %b)) +; CHECK-NEXT: Member: {%b,+,4}<%loop> +; CHECK-NEXT: Group [[GRP7]]: +; CHECK-NEXT: (Low: %c High: ((4 * %n) + %c)) +; CHECK-NEXT: Member: {%c,+,4}<%loop> +; CHECK-NEXT: Group [[GRP8]]: +; CHECK-NEXT: (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a)) +; CHECK-NEXT: Member: {((4 * %offset) + %a),+,4}<%loop> ; CHECK-EMPTY: ; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. ; CHECK-NEXT: SCEV assumptions: diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll index 72f092adf5054..77b78fb4bd4f8 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata-2.ll @@ -1,7 +1,7 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata-2.bc | FileCheck %s ; CHECK-LABEL: define void @_ZN4alsa3pcm3PCM17hw_params_current17hf1c237aece2f69c4E() { -; CHECK: #dbg_value(ptr undef, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 +; CHECK: #dbg_value(ptr poison, !4, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !14 ; CHECK-LABEL: define void @_ZN4alsa3pcm8HwParams3any17h02a64cfc85ce8a66E() { -; CHECK: #dbg_value(ptr undef, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 +; CHECK: #dbg_value(ptr poison, !23, !DIExpression(DW_OP_LLVM_fragment, 0, 64), !28 diff --git a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll index ecc39a86c6327..84b1a8f5ba45d 100644 --- a/llvm/test/Bitcode/constexpr-to-instr-metadata.ll +++ b/llvm/test/Bitcode/constexpr-to-instr-metadata.ll @@ -1,4 +1,4 @@ ; RUN: llvm-dis -expand-constant-exprs < %S/Inputs/constexpr-to-instr-metadata.bc | FileCheck %s ; CHECK-LABEL: define void @test() { -; CHECK: #dbg_value(i64 undef, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 +; CHECK: #dbg_value(i64 poison, !4, !DIExpression(DW_OP_LLVM_fragment, 64, 64), !13 diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll index bfe9ab8424bb0..0bd7c1b10b123 100644 --- a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll +++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll @@ -7,7 +7,7 @@ define half @f2h(float %a) { ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll index 0c3a40d93d640..21729b9dfd101 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fadd_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 +712,22 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __addsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll index 24088998f36d1..9b5e48d2b4217 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmax.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; 
SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmax_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmax_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fmaxf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll index 65f1f4863c173..f6c542fe7d407 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fmin.ll @@ -62,13 +62,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -150,13 +150,13 @@ define half @test_atomicrmw_fmin_f16_seq_cst_align4(ptr %ptr, 
half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -592,22 +592,22 @@ define <2 x half> @test_atomicrmw_fmin_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB6_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl fminf -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll index 0f1a2f03c98c3..82e0f14e68e26 100644 --- a/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll +++ b/llvm/test/CodeGen/AArch64/atomicrmw-fsub.ll @@ -60,13 +60,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align2(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB0_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB0_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB0_2 Depth=1 @@ -148,13 +148,13 @@ define half @test_atomicrmw_fsub_f16_seq_cst_align4(ptr %ptr, half %value) #0 { ; SOFTFP-NOLSE-NEXT: // Child Loop BB1_3 Depth 2 ; SOFTFP-NOLSE-NEXT: mov w22, w0 ; SOFTFP-NOLSE-NEXT: and w0, w20, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w21, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w21 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w0 ; SOFTFP-NOLSE-NEXT: .LBB1_3: // %cmpxchg.start ; SOFTFP-NOLSE-NEXT: // Parent Loop BB1_2 Depth=1 @@ -712,22 
+712,22 @@ define <2 x half> @test_atomicrmw_fsub_v2f16_seq_cst_align4(ptr %ptr, <2 x half> ; SOFTFP-NOLSE-NEXT: // =>This Loop Header: Depth=1 ; SOFTFP-NOLSE-NEXT: // Child Loop BB7_3 Depth 2 ; SOFTFP-NOLSE-NEXT: and w0, w19, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w23, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w24 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w24, w0 ; SOFTFP-NOLSE-NEXT: and w0, w21, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w25, w0 ; SOFTFP-NOLSE-NEXT: and w0, w22, #0xffff -; SOFTFP-NOLSE-NEXT: bl __gnu_h2f_ieee +; SOFTFP-NOLSE-NEXT: bl __extendhfsf2 ; SOFTFP-NOLSE-NEXT: mov w1, w25 ; SOFTFP-NOLSE-NEXT: bl __subsf3 -; SOFTFP-NOLSE-NEXT: bl __gnu_f2h_ieee +; SOFTFP-NOLSE-NEXT: bl __truncsfhf2 ; SOFTFP-NOLSE-NEXT: mov w8, w22 ; SOFTFP-NOLSE-NEXT: bfi w0, w24, #16, #16 ; SOFTFP-NOLSE-NEXT: bfi w8, w23, #16, #16 diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll index e9722f348f411..363f0a0598e23 100644 --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -18,6 +18,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a77 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-a78 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=cortex-x1 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=grace 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-e1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=neoverse-n2 2>&1 | FileCheck %s diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll index 3db802a2bc355..63b8a1cee27ae 100644 --- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll @@ -22,7 +22,7 @@ define void @f16_arg(half %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w30, -16 ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str w0, [x19] ; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload ; NOFP16-NEXT: ret @@ -44,10 +44,10 @@ define void @v2f16_arg(<2 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w0, #0xffff ; NOFP16-NEXT: mov x19, x2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload @@ -73,14 +73,14 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: and w0, w1, #0xffff ; NOFP16-NEXT: mov x19, x3 ; NOFP16-NEXT: mov w20, w2 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w8, w0 ; NOFP16-NEXT: and w0, 
w20, #0xffff ; NOFP16-NEXT: orr x21, x8, x22, lsl #32 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: str x21, [x19] ; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: str w0, [x19, #8] @@ -110,16 +110,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w3 ; NOFP16-NEXT: mov w21, w2 ; NOFP16-NEXT: mov w22, w1 -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w23, w0 ; NOFP16-NEXT: and w0, w22, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: and w0, w21, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: and w0, w20, #0xffff -; NOFP16-NEXT: bl __gnu_h2f_ieee +; NOFP16-NEXT: bl __extendhfsf2 ; NOFP16-NEXT: stp w21, w0, [x19, #8] ; NOFP16-NEXT: stp w23, w22, [x19] ; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload @@ -137,7 +137,7 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; NOFP16-NEXT: .cfi_def_cfa_offset 16 ; NOFP16-NEXT: .cfi_offset w30, -16 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; NOFP16-NEXT: ret %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict") @@ -155,10 +155,10 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: .cfi_offset w30, -32 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload @@ -180,13 +180,13 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w2 ; NOFP16-NEXT: mov w19, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w21, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w19 ; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload ; NOFP16-NEXT: mov w2, w21 @@ -212,16 +212,16 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 { ; NOFP16-NEXT: mov w0, w3 ; NOFP16-NEXT: mov w19, w2 ; NOFP16-NEXT: mov w20, w1 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w22, w0 ; NOFP16-NEXT: mov w0, w19 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w19, w0 ; NOFP16-NEXT: mov w0, w20 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w20, w0 ; NOFP16-NEXT: mov w0, w21 -; NOFP16-NEXT: bl __gnu_f2h_ieee +; NOFP16-NEXT: bl __truncsfhf2 ; NOFP16-NEXT: mov w1, w20 ; NOFP16-NEXT: mov w2, w19 ; NOFP16-NEXT: mov w3, w22 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll index 99e6c5d06a0e1..0b09cabf25a16 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll @@ -3,7 +3,8 @@ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji < %s | 
FileCheck -check-prefix=GFX8 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefix=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -amdgpu-enable-delay-alu=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define float @v_fma_f32(float %x, float %y, float %z) { ; GFX6-LABEL: v_fma_f32: @@ -107,11 +108,18 @@ define half @v_fma_f16(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %fma = call half @llvm.fma.f16(half %x, half %y, half %z) ret half %fma } @@ -145,11 +153,17 @@ define half @v_fma_f16_fneg_lhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, -v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_lhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, -v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, -v0.l, v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_lhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, -v0, v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.x = fneg half %x %fma = call half @llvm.fma.f16(half %neg.x, half %y, half %z) ret half %fma @@ -184,11 +198,17 @@ define half @v_fma_f16_fneg_rhs(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, -v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_rhs: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, -v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, -v1.l, v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_rhs: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, -v1, v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.y = fneg half %y %fma = call half @llvm.fma.f16(half %x, half %neg.y, half %z) ret half %fma @@ -223,11 +243,17 @@ define half @v_fma_f16_fneg_add(half %x, half %y, half %z) { ; GFX10-NEXT: v_fma_f16 v0, v0, v1, -v2 ; GFX10-NEXT: 
s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_fma_f16_fneg_add: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, -v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_fma_f16_fneg_add: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v1.l, -v2.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_fma_f16_fneg_add: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_fma_f16 v0, v0, v1, -v2 +; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %neg.z = fneg half %z %fma = call half @llvm.fma.f16(half %x, half %y, half %neg.z) ret half %fma diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index ac7944f25fe37..23e4b80b61f69 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s +# FIXME-TRUE16. reenable after fix-sgpr-copies is fixed for true16 flow +# XUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s --- diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll index 52a23690dcf53..a33fd03e0ce03 100644 --- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll @@ -3,8 +3,10 @@ ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX9,GFX9-GISEL ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX10,GFX10-GISEL -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-SDAG -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11,GFX11-GISEL +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-TRUE16 +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-SDAG-FAKE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-TRUE16 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX11-GISEL-FAKE16 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-SDAG ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GFX12,GFX12-GISEL @@ -24,11 +26,34 @@ define half @test_fma(half %x, half %y, half %z) { ; 
GFX10-NEXT: v_fma_f16 v0, v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fma: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fma_f16 v0, v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fma: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fma: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fma: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fma: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fma_f16 v0, v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fma: ; GFX12: ; %bb.0: @@ -57,11 +82,31 @@ define half @test_fmac(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmac_f16_e32 v0, v1, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmac: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmac_f16_e32 v0, v1, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmac: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v1.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmac: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmac: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.l, v1.l, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmac: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v0, v1, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmac: ; GFX12: ; %bb.0: @@ -98,11 +143,31 @@ define half @test_fmaak(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmaak: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmaak: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l +; 
GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v0.h, 0x4200 +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmaak: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmaak: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmaak_f16 v0.l, v0.l, v1.l, 0x4200 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmaak: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmaak_f16 v0, v0, v1, 0x4200 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmaak: ; GFX12: ; %bb.0: @@ -139,11 +204,33 @@ define half @test_fmamk(half %x, half %y, half %z) { ; GFX10-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: test_fmamk: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_fmamk: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmamk_f16 v0.l, v0.l, 0x4200, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_fmamk: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_fmamk: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v2.l, 0x4200, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, v2 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_fmamk: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_fmamk_f16 v0, v0, 0x4200, v2 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-LABEL: test_fmamk: ; GFX12: ; %bb.0: @@ -208,33 +295,61 @@ define i32 @test_D139469_f16(half %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_f16: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-SDAG-NEXT: v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-SDAG-NEXT: v_min_f16_e32 v0, v2, v1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e -; GFX11-GISEL-NEXT: 
v_mul_f16_e32 v2, 0x291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_min_f16_e32 v0.l, v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-SDAG-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-SDAG-FAKE16-NEXT: v_min_f16_e32 v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b16_e32 v0.h, 0x211e +; GFX11-GISEL-TRUE16-NEXT: v_mul_f16_e32 v1.l, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_fmac_f16_e32 v0.h, 0x291e, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.h +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e +; GFX11-GISEL-FAKE16-NEXT: v_mul_f16_e32 v2, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_fmac_f16_e32 v1, 0x291e, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: 
s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: test_D139469_f16: ; GFX12-SDAG: ; %bb.0: ; %bb @@ -347,44 +462,83 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) { ; GFX10-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 ; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-SDAG-LABEL: test_D139469_v2f16: -; GFX11-SDAG: ; %bb.0: ; %bb -; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-NEXT: s_movk_i32 s0, 0x211e -; GFX11-SDAG-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] -; GFX11-SDAG-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-NEXT: v_pk_min_f16 v0, v1, v0 -; GFX11-SDAG-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-SDAG-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 -; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-GISEL-LABEL: test_D139469_v2f16: -; GFX11-GISEL: ; %bb.0: ; %bb -; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0x211e211e -; GFX11-GISEL-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v1, 16, v2 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-GISEL-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 -; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-GISEL-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 -; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-GISEL-NEXT: s_or_b32 s0, s1, s2 -; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX11-SDAG-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-TRUE16: ; %bb.0: ; %bb +; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-TRUE16-NEXT: s_movk_i32 s0, 0x211e +; GFX11-SDAG-TRUE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1.l +; GFX11-SDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-SDAG-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-SDAG-FAKE16: ; %bb.0: ; %bb +; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: s_movk_i32 s0, 0x211e +; 
GFX11-SDAG-FAKE16-NEXT: v_pk_mul_f16 v1, 0x291e, v0 op_sel_hi:[0,1] +; GFX11-SDAG-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e, v0, s0 op_sel_hi:[0,1,0] +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_pk_min_f16 v0, v1, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v0 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-TRUE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-TRUE16: ; %bb.0: ; %bb +; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-TRUE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0.l +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1.l +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-TRUE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3.l +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-GISEL-FAKE16-LABEL: test_D139469_v2f16: +; GFX11-GISEL-FAKE16: ; %bb.0: ; %bb +; GFX11-GISEL-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v1, 0x211e211e +; GFX11-GISEL-FAKE16-NEXT: v_pk_mul_f16 v2, 0x291e291e, v0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_pk_fma_f16 v0, 0x291e291e, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v2 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e32 vcc_lo, 0, v2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-GISEL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s0, 0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s1, 0, v1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-GISEL-FAKE16-NEXT: v_cmp_gt_f16_e64 s2, 0, v3 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_or_b32 s0, s1, s2 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-GISEL-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-GISEL-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-LABEL: 
test_D139469_v2f16: ; GFX12-SDAG: ; %bb.0: ; %bb diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll index b008f397318e8..89c9801b5e466 100644 --- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll +++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs-debug-info.ll @@ -1,4 +1,4 @@ -; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ +; RUN: opt -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -passes='amdgpu-attributor,function(amdgpu-lower-kernel-arguments)' -amdgpu-kernarg-preload-count=16 -S < %s 2>&1 \ ; RUN: | FileCheck --match-full-lines --implicit-check-not='declare' %s ; Confirms we do not leave behind a declaration which references the same diff --git a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll index 0ad1c30b5b5a4..1f36101c7b53a 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll +++ b/llvm/test/CodeGen/AMDGPU/preserve-hi16.ll @@ -814,7 +814,8 @@ define i32 @zext_fma_f16(half %x, half %y, half %z) { ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l ; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-TRUE16-NEXT: v_fma_f16 v0.l, v0.l, v0.h, v1.l +; GFX11-TRUE16-NEXT: v_fmac_f16_e32 v1.l, v0.l, v0.h +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l ; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir new file mode 100644 index 0000000000000..d551ad88f56b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-fake16.mir @@ -0,0 +1,242 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f32 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: mad_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_cvv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vcv_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vvc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX11-LABEL: name: fma_vsc_f16 + ; GFX11: $vgpr0 = IMPLICIT_DEF + ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: SI_RETURN implicit $vgpr2 + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_fake16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir new file mode 100644 index 0000000000000..89ef5df9beb8e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma-gfx10.mir @@ -0,0 +1,258 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 + +--- +name: mad_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, 1092616192, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vcv_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, 1092616192, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vvc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f32 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f32 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F32_e64 0, $vgpr0, 0, $vgpr1, 0, 1092616192, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: mad_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: mad_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_MAD_F16_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_cvv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_cvv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... 
+ +--- +name: fma_vcv_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vcv_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vvc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vvc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... + +--- +name: fma_vsc_f16 +body: | + bb.0: + ; GFX10-LABEL: name: fma_vsc_f16 + ; GFX10: $vgpr0 = IMPLICIT_DEF + ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF + ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX10-NEXT: SI_RETURN implicit $vgpr2 + ; + $vgpr0 = IMPLICIT_DEF + $sgpr1 = IMPLICIT_DEF + $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + SI_RETURN implicit $vgpr2 +... diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir index 26feb8120c751..c9138dda7d1a7 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir @@ -1,17 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX10 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass si-shrink-instructions -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=GFX11 --- name: mad_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -27,12 +20,6 @@ body: | name: mad_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -48,12 +35,6 @@ body: | name: mad_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -69,12 +50,6 @@ body: | name: mad_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = 
IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -90,12 +65,6 @@ body: | name: fma_cvv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -111,12 +80,6 @@ body: | name: fma_vcv_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F32 $vgpr0, 1092616192, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -132,12 +95,6 @@ body: | name: fma_vvc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -153,12 +110,6 @@ body: | name: fma_vsc_f32 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f32 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F32 $vgpr0, $vgpr1, 1092616192, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f32 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -174,12 +125,6 @@ body: | name: mad_cvv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -195,12 +140,6 @@ body: | name: mad_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -216,12 +155,6 @@ body: | name: mad_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF @@ -237,12 +170,6 @@ body: | name: mad_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: mad_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_MADAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: mad_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF @@ -258,20 +185,14 @@ body: | name: fma_cvv_f16 
body: | bb.0: - ; GFX10-LABEL: name: fma_cvv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, 18688, 0, $vgpr0, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, 18688, 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -279,20 +200,14 @@ body: | name: fma_vcv_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vcv_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAMK_F16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAMK_F16_t16 $vgpr0_lo16, 18688, $vgpr1_lo16, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, 18688, 0, $vgpr1, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, 18688, 0, $vgpr1_lo16, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... @@ -300,20 +215,14 @@ body: | name: fma_vvc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vvc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_lo16, $vgpr1_lo16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_lo16, 0, $vgpr1_lo16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... 
@@ -321,19 +230,13 @@ body: | name: fma_vsc_f16 body: | bb.0: - ; GFX10-LABEL: name: fma_vsc_f16 - ; GFX10: $vgpr0 = IMPLICIT_DEF - ; GFX10-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX10-NEXT: $vgpr2 = V_FMAAK_F16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec - ; GFX10-NEXT: SI_RETURN implicit $vgpr2 - ; ; GFX11-LABEL: name: fma_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2_lo16 = V_FMAAK_F16_t16 $vgpr0_hi16, $vgpr1_hi16, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $sgpr1 = IMPLICIT_DEF - $vgpr2 = V_FMA_F16_gfx9_e64 0, $vgpr0, 0, $vgpr1, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec + $vgpr2_lo16 = V_FMA_F16_gfx9_t16_e64 0, $vgpr0_hi16, 0, $vgpr1_hi16, 0, 18688, 0, 0, 0, implicit $mode, implicit $exec SI_RETURN implicit $vgpr2 ... diff --git a/llvm/test/CodeGen/LoongArch/fp16-promote.ll b/llvm/test/CodeGen/LoongArch/fp16-promote.ll index c5a27a7011278..3701f0df1d2b2 100644 --- a/llvm/test/CodeGen/LoongArch/fp16-promote.ll +++ b/llvm/test/CodeGen/LoongArch/fp16-promote.ll @@ -23,12 +23,12 @@ define float @test_fpextend_float(ptr %p) nounwind { ; LA32-LABEL: test_fpextend_float: ; LA32: # %bb.0: ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: b %plt(__gnu_h2f_ieee) +; LA32-NEXT: b %plt(__extendhfsf2) ; ; LA64-LABEL: test_fpextend_float: ; LA64: # %bb.0: ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: b %plt(__gnu_h2f_ieee) +; LA64-NEXT: b %plt(__extendhfsf2) %a = load half, ptr %p %r = fpext half %a to float ret float %r @@ -40,7 +40,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: ld.hu $a0, $a0, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fcvt.d.s $fa0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -51,7 +51,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: ld.hu $a0, $a0, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fcvt.d.s $fa0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 @@ -68,7 +68,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill ; LA32-NEXT: move $fp, $a0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -81,7 +81,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: ld.d $fp, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -132,12 +132,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; 
LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -152,12 +152,12 @@ define half @test_fadd_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -178,12 +178,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -202,12 +202,12 @@ define void @test_fadd_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -231,12 +231,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA32-NEXT: fst.d $fs0, $sp, 0 # 8-byte Folded Spill ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: move $a0, $a1 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $fp -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA32-NEXT: ld.w $fp, $sp, 8 # 4-byte Folded Reload ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -251,12 +251,12 @@ define half @test_fmul_reg(half %a, half %b) nounwind { ; LA64-NEXT: fst.d $fs0, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: move $a0, $a1 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $fp -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: ld.d $fp, $sp, 16 # 8-byte 
Folded Reload ; LA64-NEXT: ld.d $ra, $sp, 24 # 8-byte Folded Reload @@ -277,12 +277,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA32-NEXT: move $fp, $a0 ; LA32-NEXT: ld.hu $s0, $a0, 0 ; LA32-NEXT: ld.hu $a0, $a1, 0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmov.s $fs0, $fa0 ; LA32-NEXT: move $a0, $s0 -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: st.h $a0, $fp, 0 ; LA32-NEXT: fld.d $fs0, $sp, 8 # 8-byte Folded Reload ; LA32-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload @@ -301,12 +301,12 @@ define void @test_fmul_mem(ptr %p, ptr %q) nounwind { ; LA64-NEXT: move $fp, $a0 ; LA64-NEXT: ld.hu $s0, $a0, 0 ; LA64-NEXT: ld.hu $a0, $a1, 0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmov.s $fs0, $fa0 ; LA64-NEXT: move $a0, $s0 -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fmul.s $fa0, $fa0, $fs0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: st.h $a0, $fp, 0 ; LA64-NEXT: fld.d $fs0, $sp, 0 # 8-byte Folded Reload ; LA64-NEXT: ld.d $s0, $sp, 8 # 8-byte Folded Reload @@ -327,10 +327,10 @@ define half @freeze_half_undef() nounwind { ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill ; LA32-NEXT: movgr2fr.w $fa0, $zero -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -340,10 +340,10 @@ define half @freeze_half_undef() nounwind { ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill ; LA64-NEXT: movgr2fr.w $fa0, $zero -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -357,9 +357,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA32: # %bb.0: ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA32-NEXT: bl %plt(__gnu_f2h_ieee) +; LA32-NEXT: bl %plt(__truncsfhf2) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 ; LA32-NEXT: ret @@ -368,9 +368,9 @@ define half @freeze_half_poison(half %maybe.poison) nounwind { ; LA64: # %bb.0: ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: fadd.s $fa0, $fa0, $fa0 -; LA64-NEXT: bl %plt(__gnu_f2h_ieee) +; LA64-NEXT: bl %plt(__truncsfhf2) ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload ; LA64-NEXT: addi.d $sp, $sp, 16 ; LA64-NEXT: ret @@ -384,7 +384,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded 
Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -395,7 +395,7 @@ define signext i32 @test_half_to_s32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload @@ -411,7 +411,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: ftintrz.w.s $fa0, $fa0 ; LA32-NEXT: movfr2gr.s $a0, $fa0 ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload @@ -422,7 +422,7 @@ define zeroext i32 @test_half_to_s32_u32(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.w.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.s $a0, $fa0 ; LA64-NEXT: bstrpick.d $a0, $a0, 31, 0 @@ -439,7 +439,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA32: # %bb.0: # %entry ; LA32-NEXT: addi.w $sp, $sp, -16 ; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: bl %plt(__gnu_h2f_ieee) +; LA32-NEXT: bl %plt(__extendhfsf2) ; LA32-NEXT: bl %plt(__fixsfdi) ; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload ; LA32-NEXT: addi.w $sp, $sp, 16 @@ -449,7 +449,7 @@ define i64 @test_half_to_i64(half %a) nounwind { ; LA64: # %bb.0: # %entry ; LA64-NEXT: addi.d $sp, $sp, -16 ; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: bl %plt(__gnu_h2f_ieee) +; LA64-NEXT: bl %plt(__extendhfsf2) ; LA64-NEXT: ftintrz.l.s $fa0, $fa0 ; LA64-NEXT: movfr2gr.d $a0, $fa0 ; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/fp16-promote.ll b/llvm/test/CodeGen/Mips/fp16-promote.ll index 47bace9f5c03f..c03ca3a6d78dd 100644 --- a/llvm/test/CodeGen/Mips/fp16-promote.ll +++ b/llvm/test/CodeGen/Mips/fp16-promote.ll @@ -11,12 +11,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $4 ; MIPS32-NEXT: lhu $4, 0($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f0, $f20 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -33,12 +33,12 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; MIPS64-NEXT: sd $16, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 0($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -59,7 +59,7 @@ define float 
@test_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: jr $ra @@ -70,7 +70,7 @@ define float @test_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload ; MIPS64-NEXT: jr $ra @@ -86,7 +86,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: lhu $4, 0($4) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload @@ -98,7 +98,7 @@ define double @test_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: lhu $4, 0($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: ld $ra, 8($sp) # 8-byte Folded Reload @@ -115,7 +115,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS32-NEXT: addiu $sp, $sp, -24 ; MIPS32-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: sw $16, 16($sp) # 4-byte Folded Spill -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 16($sp) # 4-byte Folded Reload @@ -128,7 +128,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; MIPS64-NEXT: daddiu $sp, $sp, -16 ; MIPS64-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: sh $2, 0($16) ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload @@ -180,18 +180,18 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: swc1 $f0, 12($17) ; MIPS32-NEXT: swc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 2($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 4($17) ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: swc1 $f0, 0($17) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -209,21 +209,21 @@ define <4 x float> @test_vec_fpext_float(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $4 ; MIPS64-NEXT: lhu $4, 2($4) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: lhu $4, 6($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mfc1 $17, $f0 ; MIPS64-NEXT: mfc1 $18, $f0 ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $17, 
$17, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 ; MIPS64-NEXT: dsrl $1, $1, 32 ; MIPS64-NEXT: or $17, $1, $17 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: dsll $18, $18, 32 ; MIPS64-NEXT: mfc1 $1, $f0 ; MIPS64-NEXT: dsll $1, $1, 32 @@ -251,21 +251,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $17, $4 ; MIPS32-NEXT: lhu $4, 6($5) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $16, $5 ; MIPS32-NEXT: lhu $4, 4($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: mov.s $f20, $f0 ; MIPS32-NEXT: lhu $4, 2($16) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: cvt.d.s $f2, $f20 ; MIPS32-NEXT: sdc1 $f2, 24($17) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: sdc1 $f0, 16($17) ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 8($17) ; MIPS32-NEXT: lhu $4, 0($16) -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: nop ; MIPS32-NEXT: cvt.d.s $f0, $f0 ; MIPS32-NEXT: sdc1 $f0, 0($17) @@ -285,21 +285,21 @@ define <4 x double> @test_vec_fpext_double(ptr %p) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $17, $4 ; MIPS64-NEXT: lhu $4, 6($5) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: move $16, $5 ; MIPS64-NEXT: lhu $4, 4($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: mov.s $f24, $f0 ; MIPS64-NEXT: lhu $4, 2($16) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: cvt.d.s $f1, $f24 ; MIPS64-NEXT: sdc1 $f1, 24($17) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sdc1 $f0, 16($17) ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 8($17) ; MIPS64-NEXT: lhu $4, 0($16) -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: nop ; MIPS64-NEXT: cvt.d.s $f0, $f0 ; MIPS64-NEXT: sdc1 $f0, 0($17) @@ -326,18 +326,18 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS32-NEXT: move $16, $7 ; MIPS32-NEXT: move $17, $5 ; MIPS32-NEXT: move $18, $4 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $6, $f12 ; MIPS32-NEXT: move $19, $2 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $16, $f12 ; MIPS32-NEXT: mtc1 $17, $f12 ; MIPS32-NEXT: lw $16, 56($sp) ; MIPS32-NEXT: sh $2, 6($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: sh $19, 4($16) ; MIPS32-NEXT: sh $2, 2($16) -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: mtc1 $18, $f12 ; MIPS32-NEXT: sh $2, 0($16) ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload @@ -360,22 +360,22 @@ define void @test_vec_fptrunc_float(<4 x float> %a, ptr %p) nounwind { ; MIPS64-NEXT: move $17, $5 ; MIPS64-NEXT: move $18, $4 ; MIPS64-NEXT: sll $1, $18, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: move $19, $2 ; MIPS64-NEXT: sll $1, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: dsrl $1, $17, 32 ; MIPS64-NEXT: sll $1, $1, 0 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 4($16) -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: sh $19, 
0($16) ; MIPS64-NEXT: sh $2, 6($16) ; MIPS64-NEXT: dsrl $1, $18, 32 ; MIPS64-NEXT: sll $1, $1, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: mtc1 $1, $f12 ; MIPS64-NEXT: sh $2, 2($16) ; MIPS64-NEXT: ld $16, 8($sp) # 8-byte Folded Reload @@ -484,19 +484,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS32-NEXT: sw $16, 20($sp) # 4-byte Folded Spill ; MIPS32-NEXT: move $16, $6 ; MIPS32-NEXT: move $17, $4 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $5 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $17 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f0, $f20 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $2 ; MIPS32-NEXT: mov.s $f20, $f0 -; MIPS32-NEXT: jal __gnu_h2f_ieee +; MIPS32-NEXT: jal __extendhfsf2 ; MIPS32-NEXT: move $4, $16 -; MIPS32-NEXT: jal __gnu_f2h_ieee +; MIPS32-NEXT: jal __truncsfhf2 ; MIPS32-NEXT: add.s $f12, $f20, $f0 ; MIPS32-NEXT: lw $16, 20($sp) # 4-byte Folded Reload ; MIPS32-NEXT: lw $17, 24($sp) # 4-byte Folded Reload @@ -514,19 +514,19 @@ define half @test_fadd_fadd(half %a, half %b, half %c) nounwind { ; MIPS64-NEXT: sd $16, 0($sp) # 8-byte Folded Spill ; MIPS64-NEXT: move $16, $6 ; MIPS64-NEXT: move $17, $4 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $5, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $17, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f0, $f24 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $2, 0 ; MIPS64-NEXT: mov.s $f24, $f0 -; MIPS64-NEXT: jal __gnu_h2f_ieee +; MIPS64-NEXT: jal __extendhfsf2 ; MIPS64-NEXT: sll $4, $16, 0 -; MIPS64-NEXT: jal __gnu_f2h_ieee +; MIPS64-NEXT: jal __truncsfhf2 ; MIPS64-NEXT: add.s $f12, $f24, $f0 ; MIPS64-NEXT: ld $16, 0($sp) # 8-byte Folded Reload ; MIPS64-NEXT: ld $17, 8($sp) # 8-byte Folded Reload diff --git a/llvm/test/CodeGen/Mips/ldexp.ll b/llvm/test/CodeGen/Mips/ldexp.ll index 3753fd567a3ed..4debc6ddce4aa 100644 --- a/llvm/test/CodeGen/Mips/ldexp.ll +++ b/llvm/test/CodeGen/Mips/ldexp.ll @@ -128,12 +128,12 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; SOFT-NEXT: .cfi_offset 31, -4 ; SOFT-NEXT: .cfi_offset 16, -8 ; SOFT-NEXT: move $16, $5 -; SOFT-NEXT: jal __gnu_h2f_ieee +; SOFT-NEXT: jal __extendhfsf2 ; SOFT-NEXT: andi $4, $4, 65535 ; SOFT-NEXT: move $4, $2 ; SOFT-NEXT: jal ldexpf ; SOFT-NEXT: move $5, $16 -; SOFT-NEXT: jal __gnu_f2h_ieee +; SOFT-NEXT: jal __truncsfhf2 ; SOFT-NEXT: move $4, $2 ; SOFT-NEXT: lw $16, 16($sp) # 4-byte Folded Reload ; SOFT-NEXT: lw $ra, 20($sp) # 4-byte Folded Reload diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll new file mode 100644 index 0000000000000..50dc93325c286 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-cp.ll @@ -0,0 +1,348 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1 +define void @test_tcgen05_cp_64x128_v1(ptr 
addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2 +define void @test_tcgen05_cp_64x128_v2(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128 +define void @test_tcgen05_cp_32x128(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + + +; CHECK-LABEL: test_tcgen05_cp_128x128b +define void @test_tcgen05_cp_128x128b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x256b +define void @test_tcgen05_cp_128x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b [%r1], %rd1; +; 
CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b +define void @test_tcgen05_cp_4x256b(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b6x16_p32 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32 +define void @test_tcgen05_cp_128x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32 +define void @test_tcgen05_cp_4x256b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32 +define void @test_tcgen05_cp_128x128b_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32 +define void 
@test_tcgen05_cp_64x128_v1_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32 +define void @test_tcgen05_cp_64x128_v2_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32 +define void @test_tcgen05_cp_32x128_b6x16_p32(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_32x128_b6x16_p32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b6x16_p32_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b6x16_p32_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b6x16_p32 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b6x16_p32.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; With src_fmt as b4x16_p64 +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64 +define void @test_tcgen05_cp_128x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64 +define void 
@test_tcgen05_cp_4x256b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_4x256b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_4x256b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_4x256b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.4x256b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.4x256b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64 +define void @test_tcgen05_cp_128x128b_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_128x128b_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_128x128b_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_128x128b_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.128x128b.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.128x128b.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64 +define void @test_tcgen05_cp_64x128_v1_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v1_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v1_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::02_13.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_02_13.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64 +define void @test_tcgen05_cp_64x128_v2_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; CHECK-LABEL: test_tcgen05_cp_64x128_v2_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_64x128_v2_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.64x128b.warpx2::01_23.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.64x128b_warpx2_01_23.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} + +; CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64 +define void @test_tcgen05_cp_32x128_b4x16_p64(ptr addrspace(6) %addr, i64 %sdesc) { +; 
CHECK-LABEL: test_tcgen05_cp_32x128_b4x16_p64( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_cp_32x128_b4x16_p64_param_0]; +; CHECK-NEXT: ld.param.u64 %rd1, [test_tcgen05_cp_32x128_b4x16_p64_param_1]; +; CHECK-NEXT: tcgen05.cp.cta_group::1.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: tcgen05.cp.cta_group::2.32x128b.warpx4.b8x16.b4x16_p64 [%r1], %rd1; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg1(ptr addrspace(6) %addr, i64 %sdesc) + call void @llvm.nvvm.tcgen05.cp.32x128b_warpx4.b4x16_p64.cg2(ptr addrspace(6) %addr, i64 %sdesc) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll new file mode 100644 index 0000000000000..13a45b9d86dcf --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/tcgen05-shift.ll @@ -0,0 +1,23 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-12.8 %{ llc < %s -march=nvptx64 -mcpu=sm_100a -mattr=+ptx86 | %ptxas-verify -arch=sm_100a %} + +declare void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) +declare void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + +; CHECK-LABEL: test_tcgen05_shift +define void @test_tcgen05_shift(ptr addrspace(6) %tmem_addr) { +; CHECK-LABEL: test_tcgen05_shift( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [test_tcgen05_shift_param_0]; +; CHECK-NEXT: tcgen05.shift.cta_group::1.down [%r1]; +; CHECK-NEXT: tcgen05.shift.cta_group::2.down [%r1]; +; CHECK-NEXT: ret; + call void @llvm.nvvm.tcgen05.shift.down.cg1(ptr addrspace(6) %tmem_addr) + call void @llvm.nvvm.tcgen05.shift.down.cg2(ptr addrspace(6) %tmem_addr) + + ret void +} diff --git a/llvm/test/CodeGen/PowerPC/adde_return_type.ll b/llvm/test/CodeGen/PowerPC/adde_return_type.ll index 47c5efc35afc6..7ce11079a6267 100644 --- a/llvm/test/CodeGen/PowerPC/adde_return_type.ll +++ b/llvm/test/CodeGen/PowerPC/adde_return_type.ll @@ -3,7 +3,7 @@ ; RUN: < %s -o /dev/null 2>&1 | FileCheck %s define i64 @testAddeReturnType(i64 %X, i64 %Z) { -; CHECK: Legally typed node: {{.*}}: i64,i1 = uaddo {{.*}} +; CHECK: Legally typed node: {{.*}}: i64,glue = adde {{.*}} %cmp = icmp ne i64 %Z, 0 %conv1 = zext i1 %cmp to i64 %add = add nsw i64 %conv1, %X diff --git a/llvm/test/CodeGen/PowerPC/addegluecrash.ll b/llvm/test/CodeGen/PowerPC/addegluecrash.ll index 7cd94c0e4c2d5..a711b09b9bdfd 100644 --- a/llvm/test/CodeGen/PowerPC/addegluecrash.ll +++ b/llvm/test/CodeGen/PowerPC/addegluecrash.ll @@ -9,20 +9,20 @@ define void @bn_mul_comba8(ptr nocapture %r, ptr nocapture readonly %a, ptr noca ; CHECK-NEXT: std 4, -8(1) # 8-byte Folded Spill ; CHECK-NEXT: mr 4, 3 ; CHECK-NEXT: ld 3, -8(1) # 8-byte Folded Reload -; CHECK-NEXT: ld 6, 0(3) -; CHECK-NEXT: ld 11, 0(5) -; CHECK-NEXT: mulhdu 8, 11, 6 +; CHECK-NEXT: ld 9, 0(3) +; CHECK-NEXT: ld 8, 0(5) +; CHECK-NEXT: mulhdu 7, 8, 9 ; CHECK-NEXT: ld 3, 8(3) -; CHECK-NEXT: mulld 7, 3, 6 -; CHECK-NEXT: addc 9, 7, 8 -; CHECK-NEXT: ld 10, 8(5) -; CHECK-NEXT: mulhdu 5, 10, 11 -; CHECK-NEXT: mulld 10, 10, 11 -; CHECK-NEXT: addc 9, 9, 10 +; CHECK-NEXT: mulld 6, 3, 9 +; CHECK-NEXT: mulhdu 3, 3, 9 +; CHECK-NEXT: addc 6, 6, 7 +; CHECK-NEXT: addze 3, 3 +; CHECK-NEXT: ld 5, 8(5) +; CHECK-NEXT: mulld 7, 
5, 8 +; CHECK-NEXT: mulhdu 5, 5, 8 +; CHECK-NEXT: addc 6, 6, 7 ; CHECK-NEXT: addze 5, 5 -; CHECK-NEXT: addc 7, 7, 8 -; CHECK-NEXT: mulhdu 3, 3, 6 -; CHECK-NEXT: adde 3, 5, 3 +; CHECK-NEXT: add 3, 5, 3 ; CHECK-NEXT: cmpld 3, 5 ; CHECK-NEXT: crmove 20, 0 ; CHECK-NEXT: li 5, 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll index aead5762d0921..501227c9072c4 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi-mir.ll @@ -1103,13 +1103,13 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r11 = LWZ 0, %fixed-stack.0 :: (load (s32) from %fixed-stack.0) ; 32BIT-NEXT: renamable $r12 = LWZ 0, %fixed-stack.4 :: (load (s32) from %fixed-stack.4) ; 32BIT-NEXT: renamable $r0 = LBZ 3, %fixed-stack.1 :: (load (s8) from %fixed-stack.1 + 3, basealign 4) - ; 32BIT-NEXT: renamable $r31 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) - ; 32BIT-NEXT: renamable $r30 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r31 = LWZ 4, %fixed-stack.3 :: (load (s32) from %fixed-stack.3 + 4, basealign 16) + ; 32BIT-NEXT: renamable $r30 = LWZ 0, %fixed-stack.3 :: (load (s32) from %fixed-stack.3, align 16) ; 32BIT-NEXT: renamable $r29 = LWZ 0, %fixed-stack.5 :: (load (s32) from %fixed-stack.5, align 8) ; 32BIT-NEXT: renamable $r28 = LBZ 3, %fixed-stack.6 :: (load (s8) from %fixed-stack.6 + 3, basealign 4) ; 32BIT-NEXT: renamable $r27 = LHA 2, %fixed-stack.7 :: (load (s16) from %fixed-stack.7 + 2, basealign 4) - ; 32BIT-NEXT: renamable $r26 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) - ; 32BIT-NEXT: renamable $r25 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r26 = LWZ 4, %fixed-stack.9 :: (load (s32) from %fixed-stack.9 + 4, basealign 8) + ; 32BIT-NEXT: renamable $r25 = LWZ 0, %fixed-stack.9 :: (load (s32) from %fixed-stack.9, align 8) ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r4 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r5 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r6 @@ -1120,8 +1120,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r9 ; 32BIT-NEXT: renamable $r3 = nsw ADD4 killed renamable $r3, killed renamable $r10 ; 32BIT-NEXT: renamable $r6 = SRAWI renamable $r3, 31, implicit-def dead $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r25, implicit-def $carry - ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r26, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r26, implicit-def $carry + ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r25, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r7 = SRAWI renamable $r27, 31, implicit-def dead $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r27, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDE killed renamable $r6, killed renamable $r7, implicit-def dead $carry, implicit $carry @@ -1131,8 +1131,8 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; 32BIT-NEXT: renamable $r6 = ADDZE killed 
renamable $r6, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r12, implicit-def $carry ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r6, killed renamable $r4, implicit-def dead $carry, implicit $carry - ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r30, implicit-def $carry - ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r31, implicit-def dead $carry, implicit $carry + ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r31, implicit-def $carry + ; 32BIT-NEXT: renamable $r4 = ADDE killed renamable $r4, killed renamable $r30, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r3 = ADDC killed renamable $r3, killed renamable $r0, implicit-def $carry ; 32BIT-NEXT: renamable $r6 = ADDZE killed renamable $r4, implicit-def dead $carry, implicit $carry ; 32BIT-NEXT: renamable $r4 = ADDC killed renamable $r3, killed renamable $r11, implicit-def $carry diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll index 8f33f5ef863e6..79c59e925302a 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-abi.ll @@ -1213,14 +1213,14 @@ define i64 @test_ints_stack(i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6 ; ASM32PWR4-NEXT: addc 3, 3, 6 ; ASM32PWR4-NEXT: addze 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 9 -; ASM32PWR4-NEXT: lwz 7, 84(1) +; ASM32PWR4-NEXT: lwz 5, 84(1) ; ASM32PWR4-NEXT: addze 6, 6 ; ASM32PWR4-NEXT: addc 3, 3, 31 -; ASM32PWR4-NEXT: lwz 5, 80(1) +; ASM32PWR4-NEXT: lwz 7, 80(1) ; ASM32PWR4-NEXT: adde 6, 6, 30 -; ASM32PWR4-NEXT: addc 3, 3, 7 +; ASM32PWR4-NEXT: addc 3, 3, 5 ; ASM32PWR4-NEXT: lbz 8, 91(1) -; ASM32PWR4-NEXT: adde 5, 6, 5 +; ASM32PWR4-NEXT: adde 5, 6, 7 ; ASM32PWR4-NEXT: addc 3, 3, 8 ; ASM32PWR4-NEXT: lbz 6, 103(1) ; ASM32PWR4-NEXT: addze 5, 5 diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll index 9b1893b111556..f1bf7c262317d 100644 --- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll +++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-split.ll @@ -36,17 +36,17 @@ entry: ; CHECK32: bb.0.entry: ; CHECK32-NEXT: liveins: $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10 -; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 80, %fixed-stack.0 +; CHECK32: renamable $r[[REG1:[0-9]+]] = LWZ 84, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r3, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r4, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4 -; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 84, %fixed-stack.0 +; CHECK32: renamable $r[[REG2:[0-9]+]] = LWZ 80, %fixed-stack.0 ; CHECK32-DAG: STW killed renamable $r5, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8 ; CHECK32-DAG: STW killed renamable $r6, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12 ; CHECK32-DAG: STW renamable $r7, 16, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 16 ; CHECK32-DAG: STW renamable $r8, 20, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 20 ; CHECK32-DAG: STW killed renamable $r9, 24, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 24 -; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable $r[[REG2]], implicit-def $carry -; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG1]], implicit-def dead $carry, implicit killed $carry +; CHECK32: renamable $r4 = ADDC killed renamable $r8, killed renamable 
$r[[REG1]], implicit-def $carry +; CHECK32: renamable $r3 = ADDE killed renamable $r7, killed renamable $r[[REG2]], implicit-def dead $carry, implicit killed $carry ; CHECK32 STW killed renamable $r10, 28, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 28 ; CHECK32: BLR implicit $lr, implicit $rm, implicit $r3, implicit $r4 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll index 5f471ce83828a..53a7cb0aad9ee 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-gd-longlong.ll @@ -325,12 +325,12 @@ define i64 @loadsTGInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -346,14 +346,14 @@ define i64 @loadsTGInit() #1 { ; LARGE32-NEXT: lwz 3, L..C0@l(3) ; LARGE32-NEXT: lwz 4, L..C1@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 @@ -589,12 +589,12 @@ define i64 @loadsTWInit() #1 { ; SMALL32-NEXT: stw 0, 40(1) ; SMALL32-NEXT: bla .__tls_get_addr[PR] ; SMALL32-NEXT: lwz 4, L..C7(2) # @GInit -; SMALL32-NEXT: lwz 5, 0(3) -; SMALL32-NEXT: lwz 3, 4(3) +; SMALL32-NEXT: lwz 5, 4(3) ; SMALL32-NEXT: lwz 6, 4(4) +; SMALL32-NEXT: lwz 3, 0(3) ; SMALL32-NEXT: lwz 7, 0(4) -; SMALL32-NEXT: addc 4, 6, 3 -; SMALL32-NEXT: adde 3, 7, 5 +; SMALL32-NEXT: addc 4, 6, 5 +; SMALL32-NEXT: adde 3, 7, 3 ; SMALL32-NEXT: addi 1, 1, 32 ; SMALL32-NEXT: lwz 0, 8(1) ; SMALL32-NEXT: mtlr 0 @@ -610,14 +610,14 @@ define i64 @loadsTWInit() #1 { ; LARGE32-NEXT: lwz 3, L..C5@l(3) ; LARGE32-NEXT: lwz 4, L..C6@l(4) ; LARGE32-NEXT: bla .__tls_get_addr[PR] -; LARGE32-NEXT: lwz 5, 0(3) -; LARGE32-NEXT: lwz 3, 4(3) -; LARGE32-NEXT: addis 4, L..C7@u(2) -; LARGE32-NEXT: lwz 4, L..C7@l(4) -; LARGE32-NEXT: lwz 6, 4(4) -; LARGE32-NEXT: lwz 7, 0(4) -; LARGE32-NEXT: addc 4, 6, 3 -; LARGE32-NEXT: adde 3, 7, 5 +; LARGE32-NEXT: lwz 4, 4(3) +; LARGE32-NEXT: lwz 3, 0(3) +; LARGE32-NEXT: addis 5, L..C7@u(2) +; LARGE32-NEXT: lwz 5, L..C7@l(5) +; LARGE32-NEXT: lwz 6, 4(5) +; LARGE32-NEXT: lwz 5, 0(5) +; LARGE32-NEXT: addc 4, 6, 4 +; LARGE32-NEXT: adde 3, 5, 3 ; LARGE32-NEXT: addi 1, 1, 32 ; LARGE32-NEXT: lwz 0, 8(1) ; LARGE32-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll index 533c866eb4e12..c2d7325107a84 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-ldst-longlong.ll @@ -304,15 +304,15 @@ define i64 @loadITLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C0(r2) # 
target-flags(ppc-tprel) @IThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -327,14 +327,14 @@ define i64 @loadITLUninit2() { ; LARGE32-NEXT: lwz r4, L..C0@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -424,15 +424,15 @@ define i64 @loadITLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C1(r2) # target-flags(ppc-tprel) @IThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -447,14 +447,14 @@ define i64 @loadITLInit2() { ; LARGE32-NEXT: lwz r4, L..C1@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -544,15 +544,15 @@ define i64 @loadTLUninit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C2(r2) # target-flags(ppc-tprel) @ThreadLocalVarUninit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; 
SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -567,14 +567,14 @@ define i64 @loadTLUninit2() { ; LARGE32-NEXT: lwz r4, L..C2@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 @@ -664,15 +664,15 @@ define i64 @loadTLInit2() { ; SMALL32-NEXT: stwu r1, -32(r1) ; SMALL32-NEXT: lwz r4, L..C3(r2) # target-flags(ppc-tprel) @ThreadLocalVarInit ; SMALL32-NEXT: bla .__get_tpointer[PR] +; SMALL32-NEXT: lwz r5, L..C4(r2) # @VarInit ; SMALL32-NEXT: stw r0, 40(r1) ; SMALL32-NEXT: add r3, r3, r4 -; SMALL32-NEXT: lwz r4, L..C4(r2) # @VarInit -; SMALL32-NEXT: lwz r5, 0(r3) -; SMALL32-NEXT: lwz r3, 4(r3) -; SMALL32-NEXT: lwz r6, 0(r4) -; SMALL32-NEXT: lwz r4, 4(r4) -; SMALL32-NEXT: addc r4, r4, r3 -; SMALL32-NEXT: adde r3, r6, r5 +; SMALL32-NEXT: lwz r6, 4(r5) +; SMALL32-NEXT: lwz r5, 0(r5) +; SMALL32-NEXT: lwz r4, 4(r3) +; SMALL32-NEXT: lwz r3, 0(r3) +; SMALL32-NEXT: addc r4, r6, r4 +; SMALL32-NEXT: adde r3, r5, r3 ; SMALL32-NEXT: addi r1, r1, 32 ; SMALL32-NEXT: lwz r0, 8(r1) ; SMALL32-NEXT: mtlr r0 @@ -687,14 +687,14 @@ define i64 @loadTLInit2() { ; LARGE32-NEXT: lwz r4, L..C3@l(r3) ; LARGE32-NEXT: bla .__get_tpointer[PR] ; LARGE32-NEXT: add r3, r3, r4 -; LARGE32-NEXT: lwz r5, 0(r3) -; LARGE32-NEXT: lwz r3, 4(r3) -; LARGE32-NEXT: addis r4, L..C4@u(r2) -; LARGE32-NEXT: lwz r4, L..C4@l(r4) -; LARGE32-NEXT: lwz r6, 0(r4) -; LARGE32-NEXT: lwz r4, 4(r4) -; LARGE32-NEXT: addc r4, r4, r3 -; LARGE32-NEXT: adde r3, r6, r5 +; LARGE32-NEXT: lwz r4, 4(r3) +; LARGE32-NEXT: lwz r3, 0(r3) +; LARGE32-NEXT: addis r5, L..C4@u(r2) +; LARGE32-NEXT: lwz r5, L..C4@l(r5) +; LARGE32-NEXT: lwz r6, 4(r5) +; LARGE32-NEXT: lwz r5, 0(r5) +; LARGE32-NEXT: addc r4, r6, r4 +; LARGE32-NEXT: adde r3, r5, r3 ; LARGE32-NEXT: addi r1, r1, 32 ; LARGE32-NEXT: lwz r0, 8(r1) ; LARGE32-NEXT: mtlr r0 diff --git a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll index 268402170063e..6c0ea782c2a38 100644 --- a/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll +++ b/llvm/test/CodeGen/PowerPC/aix-tls-le-xcoff-reloc-large32.ll @@ -290,16 +290,16 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} bla 0 ; DIS-NEXT: {{0*}}[[#ADDR]]: R_RBA (idx: [[#NFA+1]]) .__get_tpointer[PR] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 4, 2, 0 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} addis 5, 2, 0 ; DIS-NEXT: 
{{0*}}[[#ADDR + 2]]: R_TOCU (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 8(4) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 8(5) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+25]]) VarInit[TE] -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 0(4) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(4) -; DIS-NEXT: addc 4, 4, 3 -; DIS-NEXT: adde 3, 6, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 6, 4(5) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(5) +; DIS-NEXT: addc 4, 6, 4 +; DIS-NEXT: adde 3, 5, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 @@ -324,10 +324,10 @@ entry: ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 12(4) ; DIS-NEXT: {{0*}}[[#ADDR + 2]]: R_TOCL (idx: [[#NFA+27]]) IThreadLocalVarUninit2[TE] ; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} add 3, 3, 4 -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 5, 0(3) -; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 4(3) -; DIS-NEXT: addic 4, 3, 1 -; DIS-NEXT: addze 3, 5 +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 4, 4(3) +; DIS-NEXT: [[#%x, ADDR:]]: {{.*}} lwz 3, 0(3) +; DIS-NEXT: addic 4, 4, 1 +; DIS-NEXT: addze 3, 3 ; DIS-NEXT: addi 1, 1, 32 ; DIS-NEXT: lwz 0, 8(1) ; DIS-NEXT: mtlr 0 diff --git a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll index 4f00cff83942a..0ff2f28207ed4 100644 --- a/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll +++ b/llvm/test/CodeGen/PowerPC/atomicrmw-cond-sub-clamp.ll @@ -357,10 +357,10 @@ define i64 @atomicrmw_usub_sat_i64(ptr %ptr, i64 %val) { ; CHECK-NEXT: .LBB7_2: # %atomicrmw.start ; CHECK-NEXT: # =>This Loop Header: Depth=1 ; CHECK-NEXT: # Child Loop BB7_4 Depth 2 -; CHECK-NEXT: subc 5, 6, 4 +; CHECK-NEXT: sub 5, 6, 4 +; CHECK-NEXT: cmpld 5, 6 ; CHECK-NEXT: li 7, 0 -; CHECK-NEXT: addze. 
8, 7 -; CHECK-NEXT: beq 0, .LBB7_4 +; CHECK-NEXT: bgt 0, .LBB7_4 ; CHECK-NEXT: # %bb.3: # %atomicrmw.start ; CHECK-NEXT: # ; CHECK-NEXT: mr 7, 5 diff --git a/llvm/test/CodeGen/PowerPC/atomics.ll b/llvm/test/CodeGen/PowerPC/atomics.ll index 07bdbb25a746a..24e71c87414e8 100644 --- a/llvm/test/CodeGen/PowerPC/atomics.ll +++ b/llvm/test/CodeGen/PowerPC/atomics.ll @@ -476,7 +476,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC32-NEXT: cmpw cr7, r3, r3 ; PPC32-NEXT: bne- cr7, .+4 ; PPC32-NEXT: isync -; PPC32-NEXT: bl __gnu_h2f_ieee +; PPC32-NEXT: bl __extendhfsf2 ; PPC32-NEXT: lwz r0, 20(r1) ; PPC32-NEXT: addi r1, r1, 16 ; PPC32-NEXT: mtlr r0 @@ -494,7 +494,7 @@ define half @load_atomic_f16__seq_cst(ptr %ptr) { ; PPC64-NEXT: cmpd cr7, r3, r3 ; PPC64-NEXT: bne- cr7, .+4 ; PPC64-NEXT: isync -; PPC64-NEXT: bl __gnu_h2f_ieee +; PPC64-NEXT: bl __extendhfsf2 ; PPC64-NEXT: nop ; PPC64-NEXT: addi r1, r1, 112 ; PPC64-NEXT: ld r0, 16(r1) @@ -582,7 +582,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC32-NEXT: .cfi_offset r30, -8 ; PPC32-NEXT: stw r30, 8(r1) # 4-byte Folded Spill ; PPC32-NEXT: mr r30, r3 -; PPC32-NEXT: bl __gnu_f2h_ieee +; PPC32-NEXT: bl __truncsfhf2 ; PPC32-NEXT: sync ; PPC32-NEXT: sth r3, 0(r30) ; PPC32-NEXT: lwz r30, 8(r1) # 4-byte Folded Reload @@ -601,7 +601,7 @@ define void @store_atomic_f16__seq_cst(ptr %ptr, half %val1) { ; PPC64-NEXT: .cfi_offset r30, -16 ; PPC64-NEXT: std r30, 112(r1) # 8-byte Folded Spill ; PPC64-NEXT: mr r30, r3 -; PPC64-NEXT: bl __gnu_f2h_ieee +; PPC64-NEXT: bl __truncsfhf2 ; PPC64-NEXT: nop ; PPC64-NEXT: sync ; PPC64-NEXT: sth r3, 0(r30) diff --git a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll index 29e7a16739864..34091ba46c3f6 100644 --- a/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll +++ b/llvm/test/CodeGen/PowerPC/cvt_i64_to_fp.ll @@ -12,11 +12,11 @@ define double @postinctodbl(ptr nocapture %llp) #0 { ; CHECK-NEXT: addic 4, 4, 1 ; CHECK-NEXT: lwz 5, 0(3) ; CHECK-NEXT: stw 5, 8(1) +; CHECK-NEXT: addze 5, 5 ; CHECK-NEXT: lfd 0, 8(1) -; CHECK-NEXT: stw 4, 4(3) -; CHECK-NEXT: addze 4, 5 +; CHECK-NEXT: stw 5, 0(3) ; CHECK-NEXT: fcfid 1, 0 -; CHECK-NEXT: stw 4, 0(3) +; CHECK-NEXT: stw 4, 4(3) ; CHECK-NEXT: addi 1, 1, 16 ; CHECK-NEXT: blr entry: diff --git a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll index 4256933300243..50f05cca80458 100644 --- a/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll +++ b/llvm/test/CodeGen/PowerPC/handle-f16-storage-type.ll @@ -17,7 +17,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -37,7 +37,7 @@ define dso_local double @loadd(ptr nocapture readonly %a) local_unnamed_addr #0 ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -61,7 +61,7 @@ define dso_local float @loadf(ptr nocapture readonly %a) local_unnamed_addr #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 2(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -81,7 +81,7 @@ define dso_local float @loadf(ptr nocapture 
readonly %a) local_unnamed_addr #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 2(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -130,9 +130,9 @@ define dso_local void @stored(ptr nocapture %a, double %b) local_unnamed_addr #0 ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -156,7 +156,7 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -179,12 +179,12 @@ define dso_local void @storef(ptr nocapture %a, float %b) local_unnamed_addr #0 ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: clrldi r3, r4, 32 ; SOFT-NEXT: std r0, 64(r1) -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -220,9 +220,9 @@ define void @test_load_store(ptr %in, ptr %out) #0 { ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -279,7 +279,7 @@ define float @test_extend32(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -298,7 +298,7 @@ define float @test_extend32(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) @@ -315,7 +315,7 @@ define double @test_extend64(ptr %addr) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: addi r1, r1, 32 ; P8-NEXT: ld r0, 16(r1) @@ -334,7 +334,7 @@ define double @test_extend64(ptr %addr) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -354,7 +354,7 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; P8-NEXT: stdu r1, -48(r1) ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -377,12 +377,12 @@ define void @test_trunc32(float %in, ptr %addr) #0 { ; SOFT-NEXT: clrldi r3, r3, 32 ; SOFT-NEXT: std r0, 64(r1) ; SOFT-NEXT: mr r30, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: 
nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -427,9 +427,9 @@ define void @test_trunc64(double %in, ptr %addr) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -448,7 +448,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpsxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -472,7 +472,7 @@ define i64 @test_fptosi_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixsfdi ; SOFT-NEXT: nop @@ -494,7 +494,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -522,12 +522,12 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: bl __floatdisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -546,7 +546,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) ; P8-NEXT: lhz r3, 0(r3) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xscvdpuxds f0, f1 ; P8-NEXT: mffprd r3, f0 @@ -570,7 +570,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: std r0, 48(r1) ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __fixunssfdi ; SOFT-NEXT: nop @@ -592,7 +592,7 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; P8-NEXT: std r0, 64(r1) ; P8-NEXT: mr r30, r4 ; P8-NEXT: xscvuxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 0(r30) ; P8-NEXT: addi r1, r1, 48 @@ -619,12 +619,12 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r4 ; SOFT-NEXT: bl __floatundisf ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 48 @@ -651,19 +651,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 4(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl 
__gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs0, vs61, vs1 @@ -714,19 +714,19 @@ define <4 x float> @test_extend32_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r6, r3 ; SOFT-NEXT: mr r3, r29 @@ -759,19 +759,19 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; P8-NEXT: stxvd2x vs62, r1, r4 # 16-byte Folded Spill ; P8-NEXT: li r4, 80 ; P8-NEXT: stxvd2x vs63, r1, r4 # 16-byte Folded Spill -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 4(r30) ; P8-NEXT: xxlor vs63, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 2(r30) ; P8-NEXT: xxlor vs62, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: lhz r3, 0(r30) ; P8-NEXT: xxlor vs61, f1, f1 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: li r3, 80 ; P8-NEXT: xxmrghd vs35, vs63, vs62 @@ -816,25 +816,25 @@ define <4 x double> @test_extend64_vec4(ptr %p) #0 { ; SOFT-NEXT: std r0, 96(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r3) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: lhz r3, 2(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: lhz r3, 4(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: lhz r3, 6(r30) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: bl __extendsfdf2 ; SOFT-NEXT: nop @@ -870,21 +870,21 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; P8-NEXT: stxvd2x vs63, r1, r3 # 16-byte Folded Spill ; P8-NEXT: mr r30, r5 ; P8-NEXT: vmr v31, v2 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxswapd vs0, vs63 ; P8-NEXT: mr r29, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xxsldwi vs0, vs63, vs63, 1 ; P8-NEXT: mr r28, r3 ; P8-NEXT: xscvspdpn f1, vs0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: xscvspdpn f1, vs63 ; P8-NEXT: mr r27, r3 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: sth r3, 6(r30) ; P8-NEXT: li r3, 48 @@ -939,48 +939,48 @@ define void @test_trunc32_vec4(<4 x float> %a, ptr %p) #0 { ; SOFT-NEXT: mr r30, r7 ; SOFT-NEXT: mr r29, r5 ; SOFT-NEXT: mr r28, r4 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r26, r3 ; SOFT-NEXT: clrldi r3, r29, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl 
__truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r28, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r27, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1093,33 +1093,33 @@ define void @test_trunc64_vec4(<4 x double> %a, ptr %p) #0 { ; SOFT-NEXT: bl __truncdfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r27, r3 ; SOFT-NEXT: clrldi r3, r28, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r28, r3 ; SOFT-NEXT: clrldi r3, r29, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: clrldi r3, r26, 48 -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 6(r30) ; SOFT-NEXT: mr r3, r29 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 4(r30) ; SOFT-NEXT: mr r3, r28 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 2(r30) ; SOFT-NEXT: mr r3, r27 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: sth r3, 0(r30) ; SOFT-NEXT: addi r1, r1, 80 @@ -1145,15 +1145,15 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; P8-NEXT: std r0, 80(r1) ; P8-NEXT: mr r30, r3 ; P8-NEXT: lhz r3, 0(r4) -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: mtfprwa f0, r30 ; P8-NEXT: fmr f31, f1 ; P8-NEXT: xscvsxdsp f1, f0 -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: xsaddsp f1, f31, f1 ; P8-NEXT: addi r1, r1, 64 @@ -1187,17 +1187,17 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; SOFT-NEXT: std r0, 80(r1) ; SOFT-NEXT: mr r30, r3 ; SOFT-NEXT: lhz r3, 0(r4) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r29, r3 ; SOFT-NEXT: extsw r3, r30 ; SOFT-NEXT: bl __floatsisf ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 32 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: clrldi r3, r3, 48 -; 
SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: mr r4, r3 ; SOFT-NEXT: mr r3, r29 @@ -1221,10 +1221,10 @@ define half @PR40273(half) #0 { ; P8-NEXT: mflr r0 ; P8-NEXT: stdu r1, -32(r1) ; P8-NEXT: std r0, 48(r1) -; P8-NEXT: bl __gnu_f2h_ieee +; P8-NEXT: bl __truncsfhf2 ; P8-NEXT: nop ; P8-NEXT: clrldi r3, r3, 48 -; P8-NEXT: bl __gnu_h2f_ieee +; P8-NEXT: bl __extendhfsf2 ; P8-NEXT: nop ; P8-NEXT: fmr f0, f1 ; P8-NEXT: xxlxor f1, f1, f1 @@ -1260,7 +1260,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: stdu r1, -32(r1) ; SOFT-NEXT: clrldi r3, r3, 48 ; SOFT-NEXT: std r0, 48(r1) -; SOFT-NEXT: bl __gnu_h2f_ieee +; SOFT-NEXT: bl __extendhfsf2 ; SOFT-NEXT: nop ; SOFT-NEXT: li r4, 0 ; SOFT-NEXT: bl __nesf2 @@ -1268,7 +1268,7 @@ define half @PR40273(half) #0 { ; SOFT-NEXT: cmplwi r3, 0 ; SOFT-NEXT: lis r3, 16256 ; SOFT-NEXT: iseleq r3, 0, r3 -; SOFT-NEXT: bl __gnu_f2h_ieee +; SOFT-NEXT: bl __truncsfhf2 ; SOFT-NEXT: nop ; SOFT-NEXT: addi r1, r1, 32 ; SOFT-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/inc-of-add.ll b/llvm/test/CodeGen/PowerPC/inc-of-add.ll index 432b5a6b362fe..98b812e7845a5 100644 --- a/llvm/test/CodeGen/PowerPC/inc-of-add.ll +++ b/llvm/test/CodeGen/PowerPC/inc-of-add.ll @@ -412,8 +412,8 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y) nounwind { ; PPC32-NEXT: not 4, 4 ; PPC32-NEXT: not 3, 3 ; PPC32-NEXT: subc 4, 8, 4 -; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 6, 6 +; PPC32-NEXT: subfe 3, 3, 7 ; PPC32-NEXT: not 5, 5 ; PPC32-NEXT: subc 6, 10, 6 ; PPC32-NEXT: subfe 5, 5, 9 diff --git a/llvm/test/CodeGen/PowerPC/pr35688.ll b/llvm/test/CodeGen/PowerPC/pr35688.ll index 5746934802eb2..8a4351b229fd1 100644 --- a/llvm/test/CodeGen/PowerPC/pr35688.ll +++ b/llvm/test/CodeGen/PowerPC/pr35688.ll @@ -8,9 +8,10 @@ define void @ec_GFp_nistp256_points_mul() { ; CHECK-LABEL: ec_GFp_nistp256_points_mul: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: ld 3, 0(3) -; CHECK-NEXT: subfic 4, 3, 0 ; CHECK-NEXT: li 4, 0 +; CHECK-NEXT: subfic 5, 3, 0 ; CHECK-NEXT: subfze 5, 4 +; CHECK-NEXT: sradi 5, 5, 63 ; CHECK-NEXT: subc 3, 5, 3 ; CHECK-NEXT: subfe 3, 4, 5 ; CHECK-NEXT: sradi 3, 3, 63 diff --git a/llvm/test/CodeGen/PowerPC/pr36292.ll b/llvm/test/CodeGen/PowerPC/pr36292.ll index 98d94646bce65..1794b3ba526ed 100644 --- a/llvm/test/CodeGen/PowerPC/pr36292.ll +++ b/llvm/test/CodeGen/PowerPC/pr36292.ll @@ -12,12 +12,11 @@ define void @test() nounwind comdat { ; CHECK-NEXT: std 30, -16(1) # 8-byte Folded Spill ; CHECK-NEXT: stdu 1, -64(1) ; CHECK-NEXT: std 0, 80(1) -; CHECK-NEXT: li 4, 0 ; CHECK-NEXT: ld 3, 0(3) ; CHECK-NEXT: ld 30, 32(1) -; CHECK-NEXT: subc 3, 3, 30 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: sub 4, 3, 30 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: addi 29, 3, 1 ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: .LBB0_1: # %forcond diff --git a/llvm/test/CodeGen/PowerPC/pr40922.ll b/llvm/test/CodeGen/PowerPC/pr40922.ll index ed840ad12b7ed..9252e9a3e3aa4 100644 --- a/llvm/test/CodeGen/PowerPC/pr40922.ll +++ b/llvm/test/CodeGen/PowerPC/pr40922.ll @@ -23,10 +23,11 @@ define i32 @a() { ; CHECK-NEXT: li 5, 0 ; CHECK-NEXT: mr 30, 3 ; CHECK-NEXT: addic 6, 4, 6 -; CHECK-NEXT: addze. 5, 5 -; CHECK-NEXT: rlwinm 5, 6, 0, 28, 26 -; CHECK-NEXT: cmplw 1, 5, 4 -; CHECK-NEXT: crnand 20, 4, 2 +; CHECK-NEXT: addze 5, 5 +; CHECK-NEXT: rlwinm 6, 6, 0, 28, 26 +; CHECK-NEXT: andi. 
5, 5, 1 +; CHECK-NEXT: cmplw 1, 6, 4 +; CHECK-NEXT: crorc 20, 1, 4 ; CHECK-NEXT: bc 12, 20, .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: bl e diff --git a/llvm/test/CodeGen/PowerPC/pr45448.ll b/llvm/test/CodeGen/PowerPC/pr45448.ll index 0edbae47e9378..0f2dcb3ccc8a0 100644 --- a/llvm/test/CodeGen/PowerPC/pr45448.ll +++ b/llvm/test/CodeGen/PowerPC/pr45448.ll @@ -22,14 +22,12 @@ define hidden void @julia_tryparse_internal_45896() #0 { ; CHECK-NEXT: li r5, -3 ; CHECK-NEXT: sradi r4, r3, 63 ; CHECK-NEXT: rldic r5, r5, 4, 32 -; CHECK-NEXT: mulld r6, r4, r5 ; CHECK-NEXT: mulhdu r3, r3, r5 -; CHECK-NEXT: mulhdu r4, r4, r5 -; CHECK-NEXT: addc r3, r3, r6 -; CHECK-NEXT: li r3, 0 -; CHECK-NEXT: addze r3, r3 -; CHECK-NEXT: or. r3, r4, r3 -; CHECK-NEXT: beq cr0, .LBB0_9 +; CHECK-NEXT: maddld r6, r4, r5, r3 +; CHECK-NEXT: cmpld cr1, r6, r3 +; CHECK-NEXT: mulhdu. r3, r4, r5 +; CHECK-NEXT: crorc 4*cr5+lt, 4*cr1+lt, eq +; CHECK-NEXT: bc 4, 4*cr5+lt, .LBB0_9 ; CHECK-NEXT: # %bb.8: # %L917 ; CHECK-NEXT: .LBB0_9: # %L994 top: diff --git a/llvm/test/CodeGen/PowerPC/pr48519.ll b/llvm/test/CodeGen/PowerPC/pr48519.ll index 002dd8f0d167a..fa156454a1313 100644 --- a/llvm/test/CodeGen/PowerPC/pr48519.ll +++ b/llvm/test/CodeGen/PowerPC/pr48519.ll @@ -20,17 +20,17 @@ define void @julia__typed_vcat_20() #0 { ; CHECK-NEXT: addi r3, r3, -1 ; CHECK-NEXT: mtfprd f0, r3 ; CHECK-NEXT: xscvsxdsp f1, f0 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: li r3, 0 ; CHECK-NEXT: cmpldi r30, 0 ; CHECK-NEXT: bc 12, gt, .LBB0_1 ; CHECK-NEXT: # %bb.2: # %bb11 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(r3) ; @@ -95,7 +95,7 @@ define void @julia__hypot_17() #0 { ; CHECK-NEXT: # %bb.2: # %bb3 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(0) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fcmpu cr0, f1, f1 ; CHECK-NEXT: bun cr0, .LBB1_1 @@ -169,12 +169,12 @@ define void @func_48786() #0 { ; CHECK-NEXT: # %bb.3: # %bb4 ; CHECK-NEXT: # ; CHECK-NEXT: lhz r3, 0(r3) -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bc 4, 4*cr2+lt, .LBB2_6 ; CHECK-NEXT: # %bb.4: # %bb8 ; CHECK-NEXT: # -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: sth r3, 0(0) ; CHECK-NEXT: b .LBB2_1 @@ -273,7 +273,7 @@ define void @func_48785(half %arg) #0 { ; CHECK-NEXT: .LBB3_1: # %bb1 ; CHECK-NEXT: # ; CHECK-NEXT: fmr f1, f31 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r30, r30, -1 ; CHECK-NEXT: sth r3, 0(r29) diff --git a/llvm/test/CodeGen/PowerPC/pr49092.ll b/llvm/test/CodeGen/PowerPC/pr49092.ll index ea84c77603d08..7b524a6d2f69b 100644 --- a/llvm/test/CodeGen/PowerPC/pr49092.ll +++ b/llvm/test/CodeGen/PowerPC/pr49092.ll @@ -14,7 +14,7 @@ define dso_local half @test2(i64 %a, i64 %b) local_unnamed_addr #0 { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: addi r3, r3, 11 ; CHECK-NEXT: clrlwi r3, r3, 16 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: addi r1, r1, 32 ; CHECK-NEXT: ld r0, 16(r1) diff --git a/llvm/test/CodeGen/PowerPC/sat-add.ll b/llvm/test/CodeGen/PowerPC/sat-add.ll index d9b22bda85e44..8fff2c28da245 100644 --- a/llvm/test/CodeGen/PowerPC/sat-add.ll +++ 
b/llvm/test/CodeGen/PowerPC/sat-add.ll @@ -156,11 +156,10 @@ define i64 @unsigned_sat_constant_i64_using_min(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, %a @@ -171,11 +170,10 @@ define i64 @unsigned_sat_constant_i64_using_cmp_sum(i64 %x) { define i64 @unsigned_sat_constant_i64_using_cmp_notval(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_using_cmp_notval: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addic 3, 3, 42 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: addi 4, 3, 42 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, 42 %c = icmp ugt i64 %x, -43 @@ -348,11 +346,10 @@ define i64 @unsigned_sat_variable_i64_using_min(i64 %x, i64 %y) { define i64 @unsigned_sat_variable_i64_using_cmp_sum(i64 %x, i64 %y) { ; CHECK-LABEL: unsigned_sat_variable_i64_using_cmp_sum: ; CHECK: # %bb.0: -; CHECK-NEXT: addc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 4, 4 -; CHECK-NEXT: li 4, -1 -; CHECK-NEXT: iseleq 3, 3, 4 +; CHECK-NEXT: add 4, 3, 4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: li 3, -1 +; CHECK-NEXT: isellt 3, 3, 4 ; CHECK-NEXT: blr %a = add i64 %x, %y %c = icmp ugt i64 %x, %a @@ -862,11 +859,9 @@ define <4 x i128> @sadd(<4 x i128> %a, <4 x i128> %b) local_unnamed_addr { define i64 @unsigned_sat_constant_i64_with_single_use(i64 %x) { ; CHECK-LABEL: unsigned_sat_constant_i64_with_single_use: ; CHECK: # %bb.0: -; CHECK-NEXT: li 4, 4 -; CHECK-NEXT: subc 3, 3, 4 -; CHECK-NEXT: li 4, 0 -; CHECK-NEXT: addze. 
4, 4 -; CHECK-NEXT: iseleq 3, 0, 3 +; CHECK-NEXT: addi 4, 3, -4 +; CHECK-NEXT: cmpld 4, 3 +; CHECK-NEXT: iselgt 3, 0, 4 ; CHECK-NEXT: blr %umin = call i64 @llvm.umin.i64(i64 %x, i64 4) %sub = sub i64 %x, %umin diff --git a/llvm/test/CodeGen/PowerPC/select.ll b/llvm/test/CodeGen/PowerPC/select.ll index 10661030da8d8..289f83c475ff3 100644 --- a/llvm/test/CodeGen/PowerPC/select.ll +++ b/llvm/test/CodeGen/PowerPC/select.ll @@ -135,22 +135,18 @@ define i64 @f4_sge_0(i64 %x) { ; ; CHECK-32-LABEL: f4_sge_0: ; CHECK-32: # %bb.0: -; CHECK-32-NEXT: mr r6, r4 +; CHECK-32-NEXT: mr r5, r4 ; CHECK-32-NEXT: subfic r4, r4, 0 +; CHECK-32-NEXT: mr r6, r3 ; CHECK-32-NEXT: cmpwi r3, -1 -; CHECK-32-NEXT: subfze r5, r3 -; CHECK-32-NEXT: ble cr0, .LBB5_3 +; CHECK-32-NEXT: subfze r3, r3 +; CHECK-32-NEXT: bgt cr0, .LBB5_2 ; CHECK-32-NEXT: # %bb.1: -; CHECK-32-NEXT: ble cr0, .LBB5_4 +; CHECK-32-NEXT: mr r3, r6 ; CHECK-32-NEXT: .LBB5_2: -; CHECK-32-NEXT: mr r3, r5 -; CHECK-32-NEXT: blr -; CHECK-32-NEXT: .LBB5_3: -; CHECK-32-NEXT: mr r4, r6 -; CHECK-32-NEXT: bgt cr0, .LBB5_2 -; CHECK-32-NEXT: .LBB5_4: -; CHECK-32-NEXT: mr r5, r3 -; CHECK-32-NEXT: mr r3, r5 +; CHECK-32-NEXT: bgtlr cr0 +; CHECK-32-NEXT: # %bb.3: +; CHECK-32-NEXT: mr r4, r5 ; CHECK-32-NEXT: blr %c = icmp sge i64 %x, 0 %x.neg = sub i64 0, %x diff --git a/llvm/test/CodeGen/PowerPC/uaddo-32.ll b/llvm/test/CodeGen/PowerPC/uaddo-32.ll index 5dd5a2672b166..b5989fc2ee2da 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-32.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-32.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mtriple=powerpc-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc-ibm-aix-xcoff | FileCheck %s define noundef i32 @add(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b) %1 = extractvalue { i32, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i32, i1 } @llvm.uadd.with.overflow.i32(i32, i32) define noundef zeroext i1 @add_overflow(i32 noundef %a, i32 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: stw 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: stw 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: stw 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 
%b) %1 = extractvalue { i32, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/uaddo-64.ll b/llvm/test/CodeGen/PowerPC/uaddo-64.ll index 98e834f29467c..3c7ab2c2bab79 100644 --- a/llvm/test/CodeGen/PowerPC/uaddo-64.ll +++ b/llvm/test/CodeGen/PowerPC/uaddo-64.ll @@ -1,24 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s --check-prefix=LINUXASM -; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s --check-prefix=AIXASM +; RUN: llc < %s -mcpu=ppc -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=powerpc64-ibm-aix-xcoff | FileCheck %s define noundef i64 @add(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -31,22 +22,13 @@ entry: declare { i64, i1 } @llvm.uadd.with.overflow.i64(i64, i64) define noundef zeroext i1 @add_overflow(i64 noundef %a, i64 noundef %b, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: add_overflow: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 6, 0 -; LINUXASM-NEXT: addc 4, 3, 4 -; LINUXASM-NEXT: addze 3, 6 -; LINUXASM-NEXT: std 4, 0(5) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .add_overflow: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 4, 3, 4 -; AIXASM-NEXT: li 3, 0 -; AIXASM-NEXT: addze 3, 3 -; AIXASM-NEXT: std 4, 0(5) -; AIXASM-NEXT: blr - +; CHECK-LABEL: add_overflow: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 6, 0 +; CHECK-NEXT: addc 4, 3, 4 +; CHECK-NEXT: addze 3, 6 +; CHECK-NEXT: std 4, 0(5) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 @@ -56,28 +38,16 @@ entry: } define noundef i64 @addWithCarryIn (i64 noundef %a, i64 noundef %b, i64 noundef %c, ptr nocapture noundef writeonly %ovf) { -; LINUXASM-LABEL: addWithCarryIn: -; LINUXASM: # %bb.0: # %entry -; LINUXASM-NEXT: li 7, 0 -; LINUXASM-NEXT: addc 3, 3, 4 -; LINUXASM-NEXT: addze 4, 7 -; LINUXASM-NEXT: addc 3, 3, 5 -; LINUXASM-NEXT: addze 5, 7 -; LINUXASM-NEXT: or 4, 4, 5 -; LINUXASM-NEXT: std 4, 0(6) -; LINUXASM-NEXT: blr - -; AIXASM-LABEL: .addWithCarryIn: -; AIXASM: # %bb.0: # %entry -; AIXASM-NEXT: addc 3, 3, 4 -; AIXASM-NEXT: li 4, 0 -; AIXASM-NEXT: addze 7, 4 -; AIXASM-NEXT: addc 3, 3, 5 -; AIXASM-NEXT: addze 4, 4 -; AIXASM-NEXT: or 4, 7, 4 -; AIXASM-NEXT: std 4, 0(6) -; AIXASM-NEXT: blr - +; CHECK-LABEL: addWithCarryIn: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: li 7, 0 +; CHECK-NEXT: addc 3, 3, 4 +; CHECK-NEXT: addze 4, 7 +; CHECK-NEXT: addc 3, 3, 5 +; CHECK-NEXT: addze 5, 7 +; CHECK-NEXT: or 4, 4, 5 +; CHECK-NEXT: std 4, 0(6) +; CHECK-NEXT: blr entry: %0 = tail call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) %1 = extractvalue { i64, i1 } %0, 1 diff --git a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll 
b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll index f573fdab1b153..84895e74f18d5 100644 --- a/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll +++ b/llvm/test/CodeGen/PowerPC/umulo-128-legalisation-lowering.ll @@ -5,134 +5,137 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 { ; PPC64-LABEL: muloti_test: ; PPC64: # %bb.0: # %start -; PPC64-NEXT: addic 9, 5, -1 -; PPC64-NEXT: mulld 10, 5, 4 -; PPC64-NEXT: mulld 11, 3, 6 -; PPC64-NEXT: subfe 9, 9, 5 -; PPC64-NEXT: add 10, 11, 10 -; PPC64-NEXT: addic 11, 3, -1 -; PPC64-NEXT: mulhdu 8, 3, 6 -; PPC64-NEXT: subfe 3, 11, 3 -; PPC64-NEXT: and 3, 3, 9 -; PPC64-NEXT: addic 9, 8, -1 -; PPC64-NEXT: subfe 8, 9, 8 -; PPC64-NEXT: or 3, 3, 8 -; PPC64-NEXT: mulhdu 5, 5, 4 ; PPC64-NEXT: addic 8, 5, -1 +; PPC64-NEXT: mulhdu 9, 5, 4 +; PPC64-NEXT: mulld 10, 5, 4 ; PPC64-NEXT: subfe 5, 8, 5 -; PPC64-NEXT: li 7, 0 -; PPC64-NEXT: or 5, 3, 5 -; PPC64-NEXT: mulhdu 8, 4, 6 -; PPC64-NEXT: addc 3, 8, 10 -; PPC64-NEXT: addze 7, 7 -; PPC64-NEXT: addic 8, 7, -1 -; PPC64-NEXT: subfe 7, 8, 7 +; PPC64-NEXT: mulld 8, 3, 6 +; PPC64-NEXT: add 8, 8, 10 +; PPC64-NEXT: addic 10, 3, -1 +; PPC64-NEXT: mulhdu 7, 3, 6 +; PPC64-NEXT: subfe 3, 10, 3 +; PPC64-NEXT: and 5, 3, 5 +; PPC64-NEXT: addic 3, 7, -1 +; PPC64-NEXT: subfe 7, 3, 7 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: mulhdu 10, 4, 6 +; PPC64-NEXT: addic 7, 9, -1 +; PPC64-NEXT: add 3, 10, 8 +; PPC64-NEXT: subfe 7, 7, 9 +; PPC64-NEXT: or 5, 5, 7 +; PPC64-NEXT: subc 7, 3, 10 +; PPC64-NEXT: subfe 7, 3, 3 +; PPC64-NEXT: neg 7, 7 ; PPC64-NEXT: or 5, 5, 7 ; PPC64-NEXT: mulld 4, 4, 6 ; PPC64-NEXT: blr ; ; PPC32-LABEL: muloti_test: ; PPC32: # %bb.0: # %start -; PPC32-NEXT: stwu 1, -64(1) -; PPC32-NEXT: stw 26, 40(1) # 4-byte Folded Spill -; PPC32-NEXT: mfcr 12 -; PPC32-NEXT: stw 27, 44(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 27, 9, 4 -; PPC32-NEXT: stw 21, 20(1) # 4-byte Folded Spill +; PPC32-NEXT: stwu 1, -80(1) ; PPC32-NEXT: mr 11, 7 -; PPC32-NEXT: stw 22, 24(1) # 4-byte Folded Spill -; PPC32-NEXT: li 7, 0 -; PPC32-NEXT: mullw 26, 3, 10 -; PPC32-NEXT: stw 23, 28(1) # 4-byte Folded Spill -; PPC32-NEXT: add 27, 26, 27 -; PPC32-NEXT: stw 24, 32(1) # 4-byte Folded Spill -; PPC32-NEXT: cmpwi 7, 11, 0 -; PPC32-NEXT: stw 25, 36(1) # 4-byte Folded Spill -; PPC32-NEXT: mullw 24, 11, 6 -; PPC32-NEXT: stw 28, 48(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 29, 52(1) # 4-byte Folded Spill -; PPC32-NEXT: stw 30, 56(1) # 4-byte Folded Spill -; PPC32-NEXT: mulhwu 0, 8, 6 -; PPC32-NEXT: stw 12, 16(1) -; PPC32-NEXT: mr 12, 5 -; PPC32-NEXT: mulhwu 5, 4, 10 -; PPC32-NEXT: addc 5, 5, 27 -; PPC32-NEXT: addze 27, 7 -; PPC32-NEXT: cmpwi 2, 27, 0 -; PPC32-NEXT: mullw 25, 12, 8 -; PPC32-NEXT: add 26, 24, 25 -; PPC32-NEXT: addc 0, 0, 26 -; PPC32-NEXT: addze 26, 7 -; PPC32-NEXT: mullw 23, 8, 6 -; PPC32-NEXT: mullw 22, 4, 10 -; PPC32-NEXT: addc 24, 22, 23 -; PPC32-NEXT: adde 22, 5, 0 -; PPC32-NEXT: mulhwu 29, 6, 10 -; PPC32-NEXT: mullw 21, 12, 10 -; PPC32-NEXT: addc 5, 21, 29 -; PPC32-NEXT: mulhwu 30, 12, 10 -; PPC32-NEXT: addze 0, 30 -; PPC32-NEXT: mullw 23, 6, 9 -; PPC32-NEXT: addc 5, 23, 5 -; PPC32-NEXT: mulhwu 28, 6, 9 -; PPC32-NEXT: addze 29, 28 -; PPC32-NEXT: addc 0, 0, 29 -; PPC32-NEXT: addze 29, 7 -; PPC32-NEXT: mullw 30, 12, 9 -; PPC32-NEXT: addc 0, 30, 0 -; PPC32-NEXT: mulhwu 25, 12, 9 -; PPC32-NEXT: adde 30, 25, 29 -; PPC32-NEXT: addc 0, 0, 24 -; PPC32-NEXT: adde 30, 30, 22 -; PPC32-NEXT: addze. 29, 7 +; PPC32-NEXT: stw 26, 56(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 
26, 11, 6 +; PPC32-NEXT: stw 24, 48(1) # 4-byte Folded Spill +; PPC32-NEXT: mfcr 12 +; PPC32-NEXT: stw 27, 60(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 1, 0 -; PPC32-NEXT: mulhwu. 29, 11, 6 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: mulhwu. 29, 12, 8 +; PPC32-NEXT: stw 19, 28(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu 27, 6, 10 +; PPC32-NEXT: stw 20, 32(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 6, 11, 0 +; PPC32-NEXT: stw 21, 36(1) # 4-byte Folded Spill +; PPC32-NEXT: li 7, 0 +; PPC32-NEXT: stw 22, 40(1) # 4-byte Folded Spill +; PPC32-NEXT: mulhwu. 26, 5, 8 +; PPC32-NEXT: stw 23, 44(1) # 4-byte Folded Spill ; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: cmpwi 12, 0 -; PPC32-NEXT: crnor 20, 2, 30 +; PPC32-NEXT: stw 25, 52(1) # 4-byte Folded Spill +; PPC32-NEXT: cmpwi 5, 0 +; PPC32-NEXT: stw 28, 64(1) # 4-byte Folded Spill +; PPC32-NEXT: mullw 24, 5, 10 +; PPC32-NEXT: stw 29, 68(1) # 4-byte Folded Spill +; PPC32-NEXT: crnor 20, 2, 26 +; PPC32-NEXT: stw 30, 72(1) # 4-byte Folded Spill ; PPC32-NEXT: cmpwi 3, 0 -; PPC32-NEXT: cmpwi 7, 9, 0 -; PPC32-NEXT: crnor 24, 30, 2 -; PPC32-NEXT: mulhwu. 12, 3, 10 -; PPC32-NEXT: crorc 20, 20, 26 -; PPC32-NEXT: mcrf 7, 0 +; PPC32-NEXT: stw 12, 24(1) +; PPC32-NEXT: mulhwu 30, 5, 10 +; PPC32-NEXT: cmpwi 6, 9, 0 +; PPC32-NEXT: crnor 21, 26, 2 +; PPC32-NEXT: crorc 20, 20, 6 ; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: cmpwi 26, 0 -; PPC32-NEXT: crorc 28, 20, 2 +; PPC32-NEXT: mulhwu 12, 5, 9 +; PPC32-NEXT: mullw 26, 5, 9 +; PPC32-NEXT: mullw 22, 5, 8 +; PPC32-NEXT: addc 5, 24, 27 +; PPC32-NEXT: addze 30, 30 +; PPC32-NEXT: mullw 23, 6, 9 +; PPC32-NEXT: addc 5, 23, 5 +; PPC32-NEXT: mullw 21, 11, 6 +; PPC32-NEXT: add 27, 21, 22 +; PPC32-NEXT: mulhwu 28, 8, 6 +; PPC32-NEXT: add 27, 28, 27 +; PPC32-NEXT: cmplw 7, 27, 28 +; PPC32-NEXT: mulhwu. 23, 3, 10 +; PPC32-NEXT: mcrf 6, 0 +; PPC32-NEXT: cror 24, 20, 28 +; PPC32-NEXT: crorc 25, 21, 26 +; PPC32-NEXT: mulhwu 0, 6, 9 +; PPC32-NEXT: mullw 20, 9, 4 ; PPC32-NEXT: mulhwu. 9, 9, 4 -; PPC32-NEXT: mcrf 5, 0 -; PPC32-NEXT: crorc 20, 24, 30 +; PPC32-NEXT: mcrf 1, 0 +; PPC32-NEXT: addze 9, 0 +; PPC32-NEXT: mullw 19, 3, 10 ; PPC32-NEXT: or. 3, 4, 3 -; PPC32-NEXT: mcrf 6, 0 -; PPC32-NEXT: crorc 20, 20, 22 -; PPC32-NEXT: or. 3, 8, 11 -; PPC32-NEXT: crorc 20, 20, 10 -; PPC32-NEXT: crnor 21, 2, 26 +; PPC32-NEXT: mcrf 5, 0 +; PPC32-NEXT: addc 3, 30, 9 +; PPC32-NEXT: add 24, 19, 20 +; PPC32-NEXT: mulhwu 29, 4, 10 +; PPC32-NEXT: add 28, 29, 24 +; PPC32-NEXT: cmplw 2, 28, 29 +; PPC32-NEXT: crorc 20, 25, 6 +; PPC32-NEXT: cror 20, 20, 8 +; PPC32-NEXT: mullw 22, 4, 10 +; PPC32-NEXT: or. 
4, 8, 11 +; PPC32-NEXT: addze 4, 7 +; PPC32-NEXT: crnor 21, 2, 22 ; PPC32-NEXT: cror 20, 21, 20 -; PPC32-NEXT: cror 20, 20, 28 -; PPC32-NEXT: crandc 20, 6, 20 +; PPC32-NEXT: mullw 25, 8, 6 +; PPC32-NEXT: addc 8, 26, 3 +; PPC32-NEXT: adde 9, 12, 4 +; PPC32-NEXT: addc 3, 22, 25 +; PPC32-NEXT: adde 11, 28, 27 +; PPC32-NEXT: addc 4, 8, 3 +; PPC32-NEXT: adde 3, 9, 11 +; PPC32-NEXT: cmplw 1, 3, 9 +; PPC32-NEXT: cmplw 4, 8 +; PPC32-NEXT: crandc 22, 4, 6 ; PPC32-NEXT: mullw 6, 6, 10 -; PPC32-NEXT: bc 12, 20, .LBB0_2 +; PPC32-NEXT: bc 12, 22, .LBB0_3 ; PPC32-NEXT: # %bb.1: # %start +; PPC32-NEXT: crand 21, 6, 0 +; PPC32-NEXT: bc 12, 21, .LBB0_3 +; PPC32-NEXT: # %bb.2: # %start +; PPC32-NEXT: cror 20, 20, 24 +; PPC32-NEXT: bc 4, 20, .LBB0_4 +; PPC32-NEXT: .LBB0_3: # %start ; PPC32-NEXT: li 7, 1 -; PPC32-NEXT: .LBB0_2: # %start -; PPC32-NEXT: lwz 12, 16(1) -; PPC32-NEXT: mr 3, 30 -; PPC32-NEXT: mr 4, 0 -; PPC32-NEXT: lwz 30, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: .LBB0_4: # %start +; PPC32-NEXT: lwz 12, 24(1) +; PPC32-NEXT: lwz 30, 72(1) # 4-byte Folded Reload ; PPC32-NEXT: mtcrf 32, 12 # cr2 -; PPC32-NEXT: lwz 29, 52(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 28, 48(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 27, 44(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 26, 40(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 25, 36(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 24, 32(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 23, 28(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 22, 24(1) # 4-byte Folded Reload -; PPC32-NEXT: lwz 21, 20(1) # 4-byte Folded Reload -; PPC32-NEXT: addi 1, 1, 64 +; PPC32-NEXT: lwz 29, 68(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 28, 64(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 27, 60(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 26, 56(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 25, 52(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 24, 48(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 23, 44(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 22, 40(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 21, 36(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 20, 32(1) # 4-byte Folded Reload +; PPC32-NEXT: lwz 19, 28(1) # 4-byte Folded Reload +; PPC32-NEXT: addi 1, 1, 80 ; PPC32-NEXT: blr start: %0 = tail call { i128, i1 } @llvm.umul.with.overflow.i128(i128 %l, i128 %r) #2 diff --git a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll index 515dd0f70e948..e5c5356ce50a4 100644 --- a/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/PowerPC/urem-seteq-illegal-types.ll @@ -207,32 +207,33 @@ define i1 @test_urem_oversized(i66 %X) nounwind { ; PPC: # %bb.0: ; PPC-NEXT: lis 6, -12795 ; PPC-NEXT: ori 6, 6, 40665 -; PPC-NEXT: mulhwu 8, 5, 6 +; PPC-NEXT: mulhwu 7, 5, 6 ; PPC-NEXT: lis 9, 12057 ; PPC-NEXT: ori 9, 9, 37186 ; PPC-NEXT: mullw 11, 4, 6 -; PPC-NEXT: addc 8, 11, 8 +; PPC-NEXT: addc 7, 11, 7 ; PPC-NEXT: lis 11, -5526 ; PPC-NEXT: ori 11, 11, 61135 -; PPC-NEXT: mulhwu 7, 4, 6 -; PPC-NEXT: addze 7, 7 +; PPC-NEXT: mulhwu 8, 4, 6 +; PPC-NEXT: addze 8, 8 ; PPC-NEXT: mulhwu 10, 5, 9 ; PPC-NEXT: mullw 4, 4, 9 ; PPC-NEXT: mullw 9, 5, 9 -; PPC-NEXT: addc 8, 9, 8 -; PPC-NEXT: adde 7, 7, 10 -; PPC-NEXT: add 4, 4, 7 -; PPC-NEXT: rotlwi 9, 8, 31 +; PPC-NEXT: addc 7, 9, 7 +; PPC-NEXT: addze 9, 10 +; PPC-NEXT: rotlwi 10, 7, 31 ; PPC-NEXT: mullw 3, 3, 6 ; PPC-NEXT: mullw 6, 5, 6 ; PPC-NEXT: slwi 5, 5, 1 ; PPC-NEXT: add 3, 5, 3 ; PPC-NEXT: rotlwi 5, 6, 31 +; PPC-NEXT: rlwimi 5, 7, 31, 0, 0 +; PPC-NEXT: add 7, 8, 9 +; PPC-NEXT: 
add 4, 4, 7 ; PPC-NEXT: add 3, 4, 3 -; PPC-NEXT: rlwimi 5, 8, 31, 0, 0 -; PPC-NEXT: rlwimi 9, 3, 31, 0, 0 +; PPC-NEXT: rlwimi 10, 3, 31, 0, 0 ; PPC-NEXT: cmplw 5, 11 -; PPC-NEXT: cmplwi 1, 9, 13 +; PPC-NEXT: cmplwi 1, 10, 13 ; PPC-NEXT: rlwinm 3, 3, 31, 31, 31 ; PPC-NEXT: crandc 20, 4, 6 ; PPC-NEXT: crand 21, 6, 0 diff --git a/llvm/test/CodeGen/PowerPC/vector-llrint.ll b/llvm/test/CodeGen/PowerPC/vector-llrint.ll index 190cf6fe1eaad..9229fefced67e 100644 --- a/llvm/test/CodeGen/PowerPC/vector-llrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-llrint.ll @@ -17,10 +17,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -36,10 +36,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -55,10 +55,10 @@ define <1 x i64> @llrint_v1i64_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -85,18 +85,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -129,18 +129,18 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -172,17 +172,17 @@ define <2 x i64> @llrint_v1i64_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl 
__truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -226,34 +226,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -313,34 +313,34 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -394,31 +394,31 @@ define <4 x i64> @llrint_v4i64_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -491,66 +491,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -664,66 +664,66 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -821,59 +821,59 @@ define <8 x i64> @llrint_v8i64_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1001,130 +1001,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl 
__gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl 
__gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -1343,130 +1343,130 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; 
CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -1650,115 +1650,115 @@ define <16 x i64> @llrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1935,272 +1935,272 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl 
__truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; 
BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl 
__extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl llrintf ; BE-NEXT: nop @@ -2561,274 +2561,274 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr 
r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; 
CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl 
__extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl llrintf ; CHECK-NEXT: nop @@ -3200,238 +3200,238 @@ define <32 x i64> @llrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop 
; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; 
FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/PowerPC/vector-lrint.ll b/llvm/test/CodeGen/PowerPC/vector-lrint.ll index b6d0bd5c05894..c2576d4631db8 100644 --- a/llvm/test/CodeGen/PowerPC/vector-lrint.ll +++ b/llvm/test/CodeGen/PowerPC/vector-lrint.ll @@ -28,10 +28,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; BE-NEXT: std r0, 128(r1) ; BE-NEXT: .cfi_def_cfa_offset 112 ; BE-NEXT: .cfi_offset lr, 16 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -47,10 +47,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; CHECK-NEXT: std r0, 48(r1) ; CHECK-NEXT: .cfi_def_cfa_offset 32 ; CHECK-NEXT: .cfi_offset lr, 16 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -66,10 +66,10 @@ define <1 x i64> @lrint_v1f16(<1 x half> %x) { ; FAST-NEXT: std r0, 48(r1) ; FAST-NEXT: .cfi_def_cfa_offset 32 ; FAST-NEXT: .cfi_offset lr, 16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: mffprd r3, f0 @@ -96,18 +96,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; BE-NEXT: fmr f31, f1 ; BE-NEXT: fmr f1, f2 ; BE-NEXT: std r30, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; 
BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -140,18 +140,18 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; CHECK-NEXT: stfd f31, 88(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f2 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -183,17 +183,17 @@ define <2 x i64> @lrint_v2f16(<2 x half> %x) { ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f2 ; FAST-NEXT: std r0, 64(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f1 ; FAST-NEXT: fctid f1, f30 @@ -237,34 +237,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; BE-NEXT: stfd f31, 200(r1) # 8-byte Folded Spill ; BE-NEXT: fmr f31, f4 ; BE-NEXT: fmr f30, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -324,34 +324,34 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; CHECK-NEXT: stfd f31, 136(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f4 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: 
bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -405,31 +405,31 @@ define <4 x i64> @lrint_v4f16(<4 x half> %x) { ; FAST-NEXT: std r0, 80(r1) ; FAST-NEXT: fmr f31, f3 ; FAST-NEXT: fmr f30, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f30 ; FAST-NEXT: fctid f2, f31 @@ -502,66 +502,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; BE-NEXT: fmr f28, f5 ; BE-NEXT: fmr f27, f4 ; BE-NEXT: fmr f26, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -675,66 +675,66 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; CHECK-NEXT: stfd f31, 232(r1) # 8-byte Folded Spill ; CHECK-NEXT: fmr f31, f8 ; 
CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -832,59 +832,59 @@ define <8 x i64> @lrint_v8f16(<8 x half> %x) { ; FAST-NEXT: fmr f27, f4 ; FAST-NEXT: fmr f26, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: fmr f1, f30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr 
f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f26 @@ -1012,130 +1012,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; BE-NEXT: fmr f23, f5 ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f23 ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 652(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 668(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 660(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: fmr f31, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: fmr f30, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: fmr f29, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: fmr f28, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: fmr f27, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: fmr f26, f1 -; BE-NEXT: bl 
__gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: fmr f25, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: fmr f24, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: fmr f23, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: fmr f22, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: fmr f21, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: fmr f20, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: fmr f19, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r30, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -1354,130 +1354,130 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r3 # 16-byte Folded Spill ; CHECK-NEXT: li r3, 160 ; CHECK-NEXT: stvx v31, r1, r3 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: mr r30, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 568(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 576(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 584(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; 
CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r16, 48 ; CHECK-NEXT: fmr f31, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r17, 48 ; CHECK-NEXT: fmr f30, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r30, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -1661,115 +1661,115 @@ define <16 x i64> @lrint_v16i64_v16f16(<16 x half> %x) { ; FAST-NEXT: fmr f22, f4 ; FAST-NEXT: fmr f23, f3 ; FAST-NEXT: fmr f25, f2 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 304(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 296(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: fmr f1, f27 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee 
+; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: fmr f1, f24 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: fmr f1, f21 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f19 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: fmr f1, f18 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: fmr f1, f17 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: fmr f1, f20 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: fmr f1, f22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: fmr f1, f23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: fmr f1, f25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: fmr f1, f26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f0, f25 ; FAST-NEXT: fctid f2, f23 @@ -1946,272 +1946,272 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; BE-NEXT: fmr f22, f4 ; BE-NEXT: fmr f21, f3 ; BE-NEXT: mr r30, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f20 ; BE-NEXT: std r3, 304(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f22 ; BE-NEXT: std r3, 296(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f21 ; BE-NEXT: std r3, 280(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f24 ; BE-NEXT: std r3, 264(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; 
BE-NEXT: fmr f1, f23 ; BE-NEXT: std r3, 248(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f26 ; BE-NEXT: std r3, 232(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f25 ; BE-NEXT: std r3, 216(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f28 ; BE-NEXT: std r3, 200(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f27 ; BE-NEXT: std r3, 184(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f29 ; BE-NEXT: std r3, 168(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f30 ; BE-NEXT: std r3, 152(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1028(r1) ; BE-NEXT: std r3, 136(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: fmr f1, f31 ; BE-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1044(r1) ; BE-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1036(r1) ; BE-NEXT: mr r15, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1060(r1) ; BE-NEXT: mr r14, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1052(r1) ; BE-NEXT: mr r31, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1076(r1) ; BE-NEXT: mr r29, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1068(r1) ; BE-NEXT: mr r28, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1092(r1) ; BE-NEXT: mr r27, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1084(r1) ; BE-NEXT: mr r26, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1108(r1) ; BE-NEXT: mr r25, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1100(r1) ; BE-NEXT: mr r24, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1124(r1) ; BE-NEXT: mr r23, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1116(r1) ; BE-NEXT: mr r22, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1140(r1) ; BE-NEXT: mr r21, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1132(r1) ; BE-NEXT: mr r20, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1156(r1) ; BE-NEXT: mr r19, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1148(r1) ; BE-NEXT: mr r18, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1172(r1) ; BE-NEXT: mr r17, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; BE-NEXT: lfs f1, 1164(r1) ; BE-NEXT: mr r16, r3 -; BE-NEXT: bl __gnu_f2h_ieee +; BE-NEXT: bl __truncsfhf2 ; BE-NEXT: nop ; 
BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r16, 48 ; BE-NEXT: stfs f1, 316(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r17, 48 ; BE-NEXT: stfs f1, 312(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r18, 48 ; BE-NEXT: stfs f1, 292(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r19, 48 ; BE-NEXT: stfs f1, 276(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r20, 48 ; BE-NEXT: stfs f1, 260(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r21, 48 ; BE-NEXT: stfs f1, 244(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r22, 48 ; BE-NEXT: stfs f1, 228(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r23, 48 ; BE-NEXT: stfs f1, 212(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r24, 48 ; BE-NEXT: stfs f1, 196(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r25, 48 ; BE-NEXT: stfs f1, 180(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r26, 48 ; BE-NEXT: stfs f1, 164(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r27, 48 ; BE-NEXT: stfs f1, 148(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r28, 48 ; BE-NEXT: stfs f1, 132(r1) # 4-byte Folded Spill -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r29, 48 ; BE-NEXT: fmr f18, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r31, 48 ; BE-NEXT: fmr f17, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r14, 48 ; BE-NEXT: fmr f16, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: clrldi r3, r15, 48 ; BE-NEXT: fmr f15, f1 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 112(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f14, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f31, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 136(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f30, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 152(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f29, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 168(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f28, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 184(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f27, 
f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 200(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f26, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 216(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f25, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 232(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f24, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 248(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f23, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 264(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f22, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 280(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f21, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 296(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f20, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: ld r3, 304(r1) # 8-byte Folded Reload ; BE-NEXT: fmr f19, f1 ; BE-NEXT: clrldi r3, r3, 48 -; BE-NEXT: bl __gnu_h2f_ieee +; BE-NEXT: bl __extendhfsf2 ; BE-NEXT: nop ; BE-NEXT: bl lrintf ; BE-NEXT: nop @@ -2572,274 +2572,274 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; CHECK-NEXT: stvx v30, r1, r4 # 16-byte Folded Spill ; CHECK-NEXT: li r4, 384 ; CHECK-NEXT: stvx v31, r1, r4 # 16-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f20 ; CHECK-NEXT: std r3, 176(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f21 ; CHECK-NEXT: std r3, 160(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f22 ; CHECK-NEXT: std r3, 144(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f23 ; CHECK-NEXT: std r3, 128(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f24 ; CHECK-NEXT: std r3, 120(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f25 ; CHECK-NEXT: std r3, 112(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f26 ; CHECK-NEXT: std r3, 104(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f27 ; CHECK-NEXT: std r3, 96(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f28 ; CHECK-NEXT: std r3, 88(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f29 ; CHECK-NEXT: std r3, 80(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f30 ; CHECK-NEXT: std r3, 72(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; 
CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: fmr f1, f31 ; CHECK-NEXT: std r3, 64(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 832(r1) ; CHECK-NEXT: std r3, 56(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 840(r1) ; CHECK-NEXT: std r3, 48(r1) # 8-byte Folded Spill -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 848(r1) ; CHECK-NEXT: mr r15, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 856(r1) ; CHECK-NEXT: mr r14, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 864(r1) ; CHECK-NEXT: mr r31, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 872(r1) ; CHECK-NEXT: mr r29, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 880(r1) ; CHECK-NEXT: mr r28, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 888(r1) ; CHECK-NEXT: mr r27, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 896(r1) ; CHECK-NEXT: mr r26, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 904(r1) ; CHECK-NEXT: mr r25, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 912(r1) ; CHECK-NEXT: mr r24, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 920(r1) ; CHECK-NEXT: mr r23, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 928(r1) ; CHECK-NEXT: mr r22, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 936(r1) ; CHECK-NEXT: mr r21, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 944(r1) ; CHECK-NEXT: mr r20, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 952(r1) ; CHECK-NEXT: mr r19, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 960(r1) ; CHECK-NEXT: mr r18, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 968(r1) ; CHECK-NEXT: mr r17, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: lfs f1, 976(r1) ; CHECK-NEXT: mr r16, r3 -; CHECK-NEXT: bl __gnu_f2h_ieee +; CHECK-NEXT: bl __truncsfhf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 204 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r16, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: li r3, 200 ; CHECK-NEXT: stxsspx f1, r1, r3 # 4-byte Folded Spill ; CHECK-NEXT: clrldi r3, r17, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r18, 48 ; CHECK-NEXT: fmr f29, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r19, 48 ; 
CHECK-NEXT: fmr f28, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r20, 48 ; CHECK-NEXT: fmr f27, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r21, 48 ; CHECK-NEXT: fmr f26, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r22, 48 ; CHECK-NEXT: fmr f25, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r23, 48 ; CHECK-NEXT: fmr f24, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r24, 48 ; CHECK-NEXT: fmr f23, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r25, 48 ; CHECK-NEXT: fmr f22, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r26, 48 ; CHECK-NEXT: fmr f21, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r27, 48 ; CHECK-NEXT: fmr f20, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r28, 48 ; CHECK-NEXT: fmr f19, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r29, 48 ; CHECK-NEXT: fmr f18, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r31, 48 ; CHECK-NEXT: fmr f17, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r14, 48 ; CHECK-NEXT: fmr f16, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: clrldi r3, r15, 48 ; CHECK-NEXT: fmr f15, f1 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 48(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f14, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 56(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f30, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 64(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v30, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 72(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v29, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 80(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v28, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 88(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v27, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 96(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v26, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 104(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v25, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 112(r1) 
# 8-byte Folded Reload ; CHECK-NEXT: xxlor v24, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 120(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v23, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 128(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v22, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 144(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v21, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 160(r1) # 8-byte Folded Reload ; CHECK-NEXT: xxlor v20, f1, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: ld r3, 176(r1) # 8-byte Folded Reload ; CHECK-NEXT: fmr f31, f1 ; CHECK-NEXT: clrldi r3, r3, 48 -; CHECK-NEXT: bl __gnu_h2f_ieee +; CHECK-NEXT: bl __extendhfsf2 ; CHECK-NEXT: nop ; CHECK-NEXT: bl lrintf ; CHECK-NEXT: nop @@ -3211,238 +3211,238 @@ define <32 x i64> @lrint_v32i64_v32f16(<32 x half> %x) { ; FAST-NEXT: xxlor v31, f6, f6 ; FAST-NEXT: stxsspx f1, r1, r4 # 4-byte Folded Spill ; FAST-NEXT: lfs f1, 768(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 120 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 760(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 112 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 752(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 104 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 744(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 96 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 736(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 88 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 728(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 80 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 720(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 72 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 712(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; 
FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 64 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 704(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 56 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 696(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 48 ; FAST-NEXT: stxsdx f1, r1, r3 # 8-byte Folded Spill ; FAST-NEXT: lfs f1, 688(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v21, f1, f1 ; FAST-NEXT: lfs f1, 680(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v20, f1, f1 ; FAST-NEXT: lfs f1, 672(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: xxlor v24, f1, f1 ; FAST-NEXT: lfs f1, 664(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f31, f1 ; FAST-NEXT: lfs f1, 656(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f30, f1 ; FAST-NEXT: lfs f1, 648(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f28, f1 ; FAST-NEXT: lfs f1, 640(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f27, f1 ; FAST-NEXT: lfs f1, 632(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f26, f1 ; FAST-NEXT: lfs f1, 624(r1) -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f25, f1 ; FAST-NEXT: xxlor f1, v25, v25 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f24, f1 ; FAST-NEXT: xxlor f1, v26, v26 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f23, f1 ; FAST-NEXT: xxlor f1, v27, v27 -; FAST-NEXT: bl __gnu_f2h_ieee +; 
FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f22, f1 ; FAST-NEXT: xxlor f1, v28, v28 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f21, f1 ; FAST-NEXT: fmr f1, f29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f20, f1 ; FAST-NEXT: xxlor f1, v29, v29 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f19, f1 ; FAST-NEXT: xxlor f1, v30, v30 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f18, f1 ; FAST-NEXT: xxlor f1, v31, v31 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f29, f1 ; FAST-NEXT: fmr f1, f14 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f14, f1 ; FAST-NEXT: fmr f1, f16 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f16, f1 ; FAST-NEXT: xxlor f1, v22, v22 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fmr f17, f1 ; FAST-NEXT: xxlor f1, v23, v23 -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: li r3, 44 ; FAST-NEXT: fmr f15, f1 ; FAST-NEXT: lxsspx f1, r1, r3 # 4-byte Folded Reload -; FAST-NEXT: bl __gnu_f2h_ieee +; FAST-NEXT: bl __truncsfhf2 ; FAST-NEXT: nop ; FAST-NEXT: clrldi r3, r3, 48 -; FAST-NEXT: bl __gnu_h2f_ieee +; FAST-NEXT: bl __extendhfsf2 ; FAST-NEXT: nop ; FAST-NEXT: fctid f3, f15 ; FAST-NEXT: fctid f4, f17 diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll new file mode 100644 index 0000000000000..7c569da9291db --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-compute-known-bits.ll @@ -0,0 +1,18 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define i32 @test(<8 x i1> %mask) { +; CHECK-LABEL: test: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; CHECK-NEXT: vcpop.m a0, v0 +; CHECK-NEXT: ret + %1 = bitcast <8 x i1> %mask to i8 + %2 = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %1) + %3 = zext nneg i8 %2 to i32 + ret 
i32 %3 +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; RV32: {{.*}} +; RV64: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll new file mode 100644 index 0000000000000..16c4ade7fa9cb --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/vcpop-shl-zext-opt.ll @@ -0,0 +1,198 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=riscv32 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: llc < %s -mtriple=riscv64 -mattr=+v | FileCheck %s --check-prefixes=CHECK,RV64 + +define dso_local void @test_store1(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef signext %c, i32 noundef signext %n) { +; RV32-LABEL: test_store1: +; RV32: # %bb.0: # %entry +; RV32-NEXT: blez a3, .LBB0_6 +; RV32-NEXT: # %bb.1: # %for.body.preheader +; RV32-NEXT: li a4, 8 +; RV32-NEXT: bltu a3, a4, .LBB0_7 +; RV32-NEXT: # %bb.2: # %for.body.preheader +; RV32-NEXT: sub a4, a0, a1 +; RV32-NEXT: sltu a5, a0, a1 +; RV32-NEXT: neg a5, a5 +; RV32-NEXT: sltiu a4, a4, 32 +; RV32-NEXT: seqz a5, a5 +; RV32-NEXT: and a4, a5, a4 +; RV32-NEXT: bnez a4, .LBB0_7 +; RV32-NEXT: # %bb.3: # %vector.ph +; RV32-NEXT: lui a5, 524288 +; RV32-NEXT: addi a5, a5, -8 +; RV32-NEXT: and a5, a3, a5 +; RV32-NEXT: li a7, 0 +; RV32-NEXT: li a6, 0 +; RV32-NEXT: .LBB0_4: # %vector.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli t0, a7, 2 +; RV32-NEXT: addi t1, a7, 8 +; RV32-NEXT: add t0, a1, t0 +; RV32-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-NEXT: vle32.v v8, (t0) +; RV32-NEXT: sltu a7, t1, a7 +; RV32-NEXT: xor t0, t1, a5 +; RV32-NEXT: add a6, a6, a7 +; RV32-NEXT: vmslt.vx v10, v8, a2 +; RV32-NEXT: vcompress.vm v12, v8, v10 +; RV32-NEXT: vcpop.m a7, v10 +; RV32-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV32-NEXT: vse32.v v12, (a0) +; RV32-NEXT: slli a7, a7, 2 +; RV32-NEXT: or t0, t0, a6 +; RV32-NEXT: add a0, a0, a7 +; RV32-NEXT: mv a7, t1 +; RV32-NEXT: bnez t0, .LBB0_4 +; RV32-NEXT: # %bb.5: # %middle.block +; RV32-NEXT: bne a5, a3, .LBB0_9 +; RV32-NEXT: .LBB0_6: # %for.cond.cleanup +; RV32-NEXT: ret +; RV32-NEXT: .LBB0_7: +; RV32-NEXT: li a5, 0 +; RV32-NEXT: li a4, 0 +; RV32-NEXT: j .LBB0_9 +; RV32-NEXT: .LBB0_8: # %for.inc +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a5, a5, 1 +; RV32-NEXT: seqz a6, a5 +; RV32-NEXT: add a4, a4, a6 +; RV32-NEXT: xor a6, a5, a3 +; RV32-NEXT: or a6, a6, a4 +; RV32-NEXT: beqz a6, .LBB0_6 +; RV32-NEXT: .LBB0_9: # %for.body +; RV32-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32-NEXT: slli a6, a5, 2 +; RV32-NEXT: add a6, a1, a6 +; RV32-NEXT: lw a6, 0(a6) +; RV32-NEXT: bge a6, a2, .LBB0_8 +; RV32-NEXT: # %bb.10: # %if.then +; RV32-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV32-NEXT: addi a7, a0, 4 +; RV32-NEXT: sw a6, 0(a0) +; RV32-NEXT: mv a0, a7 +; RV32-NEXT: j .LBB0_8 +; +; RV64-LABEL: test_store1: +; RV64: # %bb.0: # %entry +; RV64-NEXT: blez a3, .LBB0_6 +; RV64-NEXT: # %bb.1: # %for.body.preheader +; RV64-NEXT: li a5, 8 +; RV64-NEXT: li a4, 0 +; RV64-NEXT: bltu a3, a5, .LBB0_7 +; RV64-NEXT: # %bb.2: # %for.body.preheader +; RV64-NEXT: sub a5, a0, a1 +; RV64-NEXT: li a6, 31 +; RV64-NEXT: bgeu a6, a5, .LBB0_7 +; RV64-NEXT: # %bb.3: # %vector.ph +; RV64-NEXT: lui a4, 524288 +; RV64-NEXT: addiw a4, a4, -8 +; RV64-NEXT: and a4, a3, a4 +; RV64-NEXT: slli a5, a4, 2 +; RV64-NEXT: add a5, a5, a1 +; RV64-NEXT: mv a6, a1 +; RV64-NEXT: 
.LBB0_4: # %vector.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV64-NEXT: vle32.v v8, (a6) +; RV64-NEXT: addi a6, a6, 32 +; RV64-NEXT: vmslt.vx v10, v8, a2 +; RV64-NEXT: vcompress.vm v12, v8, v10 +; RV64-NEXT: vcpop.m a7, v10 +; RV64-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; RV64-NEXT: vse32.v v12, (a0) +; RV64-NEXT: slli a7, a7, 2 +; RV64-NEXT: add a0, a0, a7 +; RV64-NEXT: bne a6, a5, .LBB0_4 +; RV64-NEXT: # %bb.5: # %middle.block +; RV64-NEXT: bne a4, a3, .LBB0_7 +; RV64-NEXT: .LBB0_6: # %for.cond.cleanup +; RV64-NEXT: ret +; RV64-NEXT: .LBB0_7: # %for.body.preheader13 +; RV64-NEXT: slli a4, a4, 2 +; RV64-NEXT: slli a5, a3, 2 +; RV64-NEXT: add a3, a1, a4 +; RV64-NEXT: add a1, a1, a5 +; RV64-NEXT: j .LBB0_9 +; RV64-NEXT: .LBB0_8: # %for.inc +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a3, a3, 4 +; RV64-NEXT: beq a3, a1, .LBB0_6 +; RV64-NEXT: .LBB0_9: # %for.body +; RV64-NEXT: # =>This Inner Loop Header: Depth=1 +; RV64-NEXT: lw a4, 0(a3) +; RV64-NEXT: bge a4, a2, .LBB0_8 +; RV64-NEXT: # %bb.10: # %if.then +; RV64-NEXT: # in Loop: Header=BB0_9 Depth=1 +; RV64-NEXT: addi a5, a0, 4 +; RV64-NEXT: sw a4, 0(a0) +; RV64-NEXT: mv a0, a5 +; RV64-NEXT: j .LBB0_8 +entry: + %cmp8 = icmp sgt i32 %n, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + %dst11 = ptrtoint ptr %dst to i64 + %src12 = ptrtoint ptr %src to i64 + %wide.trip.count = zext nneg i32 %n to i64 + %min.iters.check = icmp ult i32 %n, 8 + %0 = sub i64 %dst11, %src12 + %diff.check = icmp ult i64 %0, 32 + %or.cond = or i1 %min.iters.check, %diff.check + br i1 %or.cond, label %for.body.preheader13, label %vector.ph + +for.body.preheader13: ; preds = %middle.block, %for.body.preheader + %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %dst.addr.09.ph = phi ptr [ %dst, %for.body.preheader ], [ %monotonic.add, %middle.block ] + br label %for.body + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i64 %wide.trip.count, 2147483640 + %broadcast.splatinsert = insertelement <8 x i32> poison, i32 %c, i64 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> poison, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %monotonic.iv = phi ptr [ %dst, %vector.ph ], [ %monotonic.add, %vector.body ] + %1 = getelementptr inbounds i32, ptr %src, i64 %index + %wide.load = load <8 x i32>, ptr %1, align 4 + %2 = icmp slt <8 x i32> %wide.load, %broadcast.splat + tail call void @llvm.masked.compressstore.v8i32(<8 x i32> %wide.load, ptr align 4 %monotonic.iv, <8 x i1> %2) + %3 = bitcast <8 x i1> %2 to i8 + %4 = tail call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 %3) + %5 = shl nuw nsw i8 %4, 2 + %6 = zext nneg i8 %5 to i64 + %monotonic.add = getelementptr inbounds i8, ptr %monotonic.iv, i64 %6 + %index.next = add nuw i64 %index, 8 + %7 = icmp eq i64 %index.next, %n.vec + br i1 %7, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i64 %n.vec, %wide.trip.count + br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader13 + +for.cond.cleanup: ; preds = %for.inc, %middle.block, %entry + ret void + +for.body: ; preds = %for.body.preheader13, %for.inc + %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ %indvars.iv.ph, %for.body.preheader13 ] + %dst.addr.09 = phi ptr [ %dst.addr.1, %for.inc 
], [ %dst.addr.09.ph, %for.body.preheader13 ] + %arrayidx = getelementptr inbounds i32, ptr %src, i64 %indvars.iv + %8 = load i32, ptr %arrayidx, align 4 + %cmp1 = icmp slt i32 %8, %c + br i1 %cmp1, label %if.then, label %for.inc + +if.then: ; preds = %for.body + %incdec.ptr = getelementptr inbounds i8, ptr %dst.addr.09, i64 4 + store i32 %8, ptr %dst.addr.09, align 4 + br label %for.inc + +for.inc: ; preds = %for.body, %if.then + %dst.addr.1 = phi ptr [ %incdec.ptr, %if.then ], [ %dst.addr.09, %for.body ] + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/SPARC/fp16-promote.ll b/llvm/test/CodeGen/SPARC/fp16-promote.ll index a15104c7b8cff..efe67b04e8fb3 100644 --- a/llvm/test/CodeGen/SPARC/fp16-promote.ll +++ b/llvm/test/CodeGen/SPARC/fp16-promote.ll @@ -20,7 +20,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V8-LABEL: test_fpextend_float: ; V8: ! %bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: ret ; V8-NEXT: restore @@ -28,7 +28,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; V9-LABEL: test_fpextend_float: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: ret ; V9-NEXT: restore @@ -36,7 +36,7 @@ define float @test_fpextend_float(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: ret ; SPARC64-NEXT: restore @@ -49,7 +49,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V8-LABEL: test_fpextend_double: ; V8: ! %bb.0: ; V8-NEXT: save %sp, -96, %sp -; V8-NEXT: call __gnu_h2f_ieee +; V8-NEXT: call __extendhfsf2 ; V8-NEXT: lduh [%i0], %o0 ; V8-NEXT: fstod %f0, %f0 ; V8-NEXT: ret @@ -58,7 +58,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; V9-LABEL: test_fpextend_double: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: fstod %f0, %f0 ; V9-NEXT: ret @@ -67,7 +67,7 @@ define double @test_fpextend_double(ptr %p) nounwind { ; SPARC64-LABEL: test_fpextend_double: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: fstod %f0, %f0 ; SPARC64-NEXT: ret @@ -81,7 +81,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-OPT-LABEL: test_fpextend_fp128: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -112, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-20] ; V8-OPT-NEXT: add %fp, -16, %i0 @@ -99,7 +99,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V8-UNOPT-LABEL: test_fpextend_fp128: ; V8-UNOPT: ! 
%bb.0: ; V8-UNOPT-NEXT: save %sp, -112, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-20] ; V8-UNOPT-NEXT: add %fp, -16, %i0 @@ -125,7 +125,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; V9-LABEL: test_fpextend_fp128: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -112, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-20] ; V9-NEXT: add %fp, -16, %i0 @@ -143,7 +143,7 @@ define void @test_fpextend_fp128(ptr %p, ptr %out) nounwind { ; SPARC64-LABEL: test_fpextend_fp128: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: add %fp, 2031, %o0 ; SPARC64-NEXT: fmovs %f0, %f3 @@ -165,7 +165,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-OPT-LABEL: test_fptrunc_float: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -96, %sp -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: mov %i0, %o0 ; V8-OPT-NEXT: sth %o0, [%i1] ; V8-OPT-NEXT: ret @@ -176,7 +176,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V8-UNOPT-NEXT: save %sp, -96, %sp ; V8-UNOPT-NEXT: mov %i0, %o0 ; V8-UNOPT-NEXT: st %o0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %f0 ; V8-UNOPT-NEXT: sth %o0, [%i1] ; V8-UNOPT-NEXT: ret @@ -185,7 +185,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; V9-LABEL: test_fptrunc_float: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -96, %sp -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: mov %i0, %o0 ; V9-NEXT: sth %o0, [%i1] ; V9-NEXT: ret @@ -194,7 +194,7 @@ define void @test_fptrunc_float(float %f, ptr %p) nounwind { ; SPARC64-LABEL: test_fptrunc_float: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -176, %sp -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: nop ; SPARC64-NEXT: sth %o0, [%i1] ; SPARC64-NEXT: ret @@ -329,15 +329,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fadd: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fadds %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -346,16 +346,16 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fadd: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 
4-byte Folded Reload ; V8-UNOPT-NEXT: fadds %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -364,15 +364,15 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fadd: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V9-NEXT: fadds %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -381,13 +381,13 @@ define void @test_fadd(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fadd: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fadds %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret @@ -403,15 +403,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-OPT-LABEL: test_fmul: ; V8-OPT: ! %bb.0: ; V8-OPT-NEXT: save %sp, -104, %sp -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i0], %o0 ; V8-OPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-OPT-NEXT: call __gnu_h2f_ieee +; V8-OPT-NEXT: call __extendhfsf2 ; V8-OPT-NEXT: lduh [%i1], %o0 ; V8-OPT-NEXT: ld [%fp+-8], %f1 ! 4-byte Folded Reload ; V8-OPT-NEXT: fmuls %f1, %f0, %f0 ; V8-OPT-NEXT: st %f0, [%fp+-4] -; V8-OPT-NEXT: call __gnu_f2h_ieee +; V8-OPT-NEXT: call __truncsfhf2 ; V8-OPT-NEXT: ld [%fp+-4], %o0 ; V8-OPT-NEXT: sth %o0, [%i0] ; V8-OPT-NEXT: ret @@ -420,16 +420,16 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V8-UNOPT-LABEL: test_fmul: ; V8-UNOPT: ! %bb.0: ; V8-UNOPT-NEXT: save %sp, -104, %sp -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i0], %o0 ; V8-UNOPT-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V8-UNOPT-NEXT: call __gnu_h2f_ieee +; V8-UNOPT-NEXT: call __extendhfsf2 ; V8-UNOPT-NEXT: lduh [%i1], %o0 ; V8-UNOPT-NEXT: fmovs %f0, %f1 ; V8-UNOPT-NEXT: ld [%fp+-8], %f0 ! 4-byte Folded Reload ; V8-UNOPT-NEXT: fmuls %f0, %f1, %f0 ; V8-UNOPT-NEXT: st %f0, [%fp+-4] -; V8-UNOPT-NEXT: call __gnu_f2h_ieee +; V8-UNOPT-NEXT: call __truncsfhf2 ; V8-UNOPT-NEXT: ld [%fp+-4], %o0 ; V8-UNOPT-NEXT: sth %o0, [%i0] ; V8-UNOPT-NEXT: ret @@ -438,15 +438,15 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; V9-LABEL: test_fmul: ; V9: ! %bb.0: ; V9-NEXT: save %sp, -104, %sp -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i0], %o0 ; V9-NEXT: st %f0, [%fp+-8] ! 4-byte Folded Spill -; V9-NEXT: call __gnu_h2f_ieee +; V9-NEXT: call __extendhfsf2 ; V9-NEXT: lduh [%i1], %o0 ; V9-NEXT: ld [%fp+-8], %f1 ! 
4-byte Folded Reload ; V9-NEXT: fmuls %f1, %f0, %f0 ; V9-NEXT: st %f0, [%fp+-4] -; V9-NEXT: call __gnu_f2h_ieee +; V9-NEXT: call __truncsfhf2 ; V9-NEXT: ld [%fp+-4], %o0 ; V9-NEXT: sth %o0, [%i0] ; V9-NEXT: ret @@ -455,13 +455,13 @@ define void @test_fmul(ptr %p, ptr %q) nounwind { ; SPARC64-LABEL: test_fmul: ; SPARC64: ! %bb.0: ; SPARC64-NEXT: save %sp, -192, %sp -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i0], %o0 ; SPARC64-NEXT: st %f0, [%fp+2043] ! 4-byte Folded Spill -; SPARC64-NEXT: call __gnu_h2f_ieee +; SPARC64-NEXT: call __extendhfsf2 ; SPARC64-NEXT: lduh [%i1], %o0 ; SPARC64-NEXT: ld [%fp+2043], %f1 ! 4-byte Folded Reload -; SPARC64-NEXT: call __gnu_f2h_ieee +; SPARC64-NEXT: call __truncsfhf2 ; SPARC64-NEXT: fmuls %f1, %f0, %f1 ; SPARC64-NEXT: sth %o0, [%i0] ; SPARC64-NEXT: ret diff --git a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll index 4e30778d5c158..f105966bc4d08 100644 --- a/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll +++ b/llvm/test/CodeGen/VE/Scalar/fp_extload_truncstore.ll @@ -26,9 +26,9 @@ define float @func_i16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB0_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, %s11) @@ -58,9 +58,9 @@ define double @func_i16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB1_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -91,9 +91,9 @@ define float @func_fp16fp32(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB2_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s11, 0, %s9 ; CHECK-NEXT: ld %s10, 8(, %s11) @@ -123,9 +123,9 @@ define double @func_fp16fp64(ptr %a) { ; CHECK-NEXT: or %s0, 0, %s62 ; CHECK-NEXT: .LBB3_2: ; CHECK-NEXT: ld2b.zx %s0, (, %s0) -; CHECK-NEXT: lea %s1, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s1, __extendhfsf2@lo ; CHECK-NEXT: and %s1, %s1, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s1) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s1) ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: cvt.d.s %s0, %s0 ; CHECK-NEXT: or %s11, 0, %s9 @@ -157,9 +157,9 @@ define void @func_fp32i16(ptr %fl.ptr, float %val) { ; CHECK-NEXT: .LBB4_2: ; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s0, (, %s18) @@ -194,15 +194,15 @@ define half @func_fp32fp16(ptr %fl.ptr, float %a) { 
; CHECK-NEXT: st %s18, 288(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: st %s19, 296(, %s11) # 8-byte Folded Spill ; CHECK-NEXT: or %s18, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_f2h_ieee@lo +; CHECK-NEXT: lea %s0, __truncsfhf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_f2h_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __truncsfhf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s1 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: or %s19, 0, %s0 -; CHECK-NEXT: lea %s0, __gnu_h2f_ieee@lo +; CHECK-NEXT: lea %s0, __extendhfsf2@lo ; CHECK-NEXT: and %s0, %s0, (32)0 -; CHECK-NEXT: lea.sl %s12, __gnu_h2f_ieee@hi(, %s0) +; CHECK-NEXT: lea.sl %s12, __extendhfsf2@hi(, %s0) ; CHECK-NEXT: or %s0, 0, %s19 ; CHECK-NEXT: bsic %s10, (, %s12) ; CHECK-NEXT: st2b %s19, (, %s18) diff --git a/llvm/test/CodeGen/X86/cvt16.ll b/llvm/test/CodeGen/X86/cvt16.ll index c6c088297c0ea..db615c8065d03 100644 --- a/llvm/test/CodeGen/X86/cvt16.ll +++ b/llvm/test/CodeGen/X86/cvt16.ll @@ -41,7 +41,7 @@ define void @test1(float %src, ptr %dest) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rbx ; SOFTFLOAT-NEXT: movq %rsi, %rbx -; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT +; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT ; SOFTFLOAT-NEXT: movw %ax, (%rbx) ; SOFTFLOAT-NEXT: popq %rbx ; SOFTFLOAT-NEXT: retq @@ -66,7 +66,7 @@ define float @test2(ptr nocapture %src) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: popq %rcx ; SOFTFLOAT-NEXT: retq %1 = load i16, ptr %src, align 2 @@ -94,9 +94,9 @@ define float @test3(float %src) nounwind uwtable readnone { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 16 -; SOFTFLOAT-NEXT: callq __gnu_f2h_ieee@PLT +; SOFTFLOAT-NEXT: callq __truncsfhf2@PLT ; SOFTFLOAT-NEXT: movzwl %ax, %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: popq %rcx ; SOFTFLOAT-NEXT: .cfi_def_cfa_offset 8 ; SOFTFLOAT-NEXT: retq @@ -126,7 +126,7 @@ define double @test4(ptr nocapture %src) nounwind { ; SOFTFLOAT: # %bb.0: ; SOFTFLOAT-NEXT: pushq %rax ; SOFTFLOAT-NEXT: movzwl (%rdi), %edi -; SOFTFLOAT-NEXT: callq __gnu_h2f_ieee@PLT +; SOFTFLOAT-NEXT: callq __extendhfsf2@PLT ; SOFTFLOAT-NEXT: movl %eax, %edi ; SOFTFLOAT-NEXT: callq __extendsfdf2@PLT ; SOFTFLOAT-NEXT: popq %rcx diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll index 24dabfc18b9e3..16ebf70126f8b 100644 --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -124,13 +124,13 @@ define dso_local float @div_arcp_by_const(half %x) { ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: fmuls {{\.?LCPI[0-9]+_[0-9]+}} ; X86-NEXT: fstps (%esp) -; X86-NEXT: calll __gnu_f2h_ieee +; X86-NEXT: calll __truncsfhf2 ; X86-NEXT: movzwl %ax, %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: popl %eax ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/fp-i129.ll b/llvm/test/CodeGen/X86/fp-i129.ll index 97116ce4e621f..c55c19abbd9b8 100644 --- a/llvm/test/CodeGen/X86/fp-i129.ll +++ b/llvm/test/CodeGen/X86/fp-i129.ll @@ -96,7 +96,7 @@ define i257 @fptosi257_double(double %a) nounwind { ; half tests define i257 @fptosi_half(half 
%a) nounwind { ; X86-LABEL: fptosi_half: -; X86: __gnu_h2f_ieee +; X86: __extendhfsf2 ; ; X64-LABEL: fptosi_half: ; X64: __extendhfsf2 @@ -106,7 +106,7 @@ define i257 @fptosi_half(half %a) nounwind { define half @uitofp_half(i257 %a) nounwind { ; X86-LABEL: uitofp_half: -; X86: __gnu_f2h_ieee +; X86: __truncsfhf2 ; ; X64-LABEL: uitofp_half: ; X64: __truncsfhf2 diff --git a/llvm/test/CodeGen/X86/fp128-cast-strict.ll b/llvm/test/CodeGen/X86/fp128-cast-strict.ll index f141153d059ac..707b05f3478db 100644 --- a/llvm/test/CodeGen/X86/fp128-cast-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-cast-strict.ll @@ -28,7 +28,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp { ; X64-AVX512-LABEL: TestFPExtF16_F128: ; X64-AVX512: # %bb.0: # %entry ; X64-AVX512-NEXT: pushq %rax -; X64-AVX512-NEXT: vmovsh vf16(%rip), %xmm0 +; X64-AVX512-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero ; X64-AVX512-NEXT: callq __extendhftf2@PLT ; X64-AVX512-NEXT: vmovaps %xmm0, vf128(%rip) ; X64-AVX512-NEXT: popq %rax @@ -40,7 +40,7 @@ define dso_local void @TestFPExtF16_F128() nounwind strictfp { ; X86-NEXT: subl $40, %esp ; X86-NEXT: movzwl vf16, %eax ; X86-NEXT: movl %eax, (%esp) -; X86-NEXT: calll __gnu_h2f_ieee +; X86-NEXT: calll __extendhfsf2 ; X86-NEXT: fstps {{[0-9]+}}(%esp) ; X86-NEXT: wait ; X86-NEXT: leal {{[0-9]+}}(%esp), %eax diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll index 04fce7badb951..85f4c945230e1 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -2060,7 +2060,7 @@ define i1 @test_signed_i1_f16(half %f) nounwind { ; X86-X87-NEXT: subl $24, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2158,7 +2158,7 @@ define i8 @test_signed_i8_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2253,7 +2253,7 @@ define i13 @test_signed_i13_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2349,7 +2349,7 @@ define i16 @test_signed_i16_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2445,7 +2445,7 @@ define i19 @test_signed_i19_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 
0xC00 @@ -2538,7 +2538,7 @@ define i32 @test_signed_i32_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2633,7 +2633,7 @@ define i50 @test_signed_i50_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2758,7 +2758,7 @@ define i64 @test_signed_i64_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2885,7 +2885,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) @@ -3064,7 +3064,7 @@ define i128 @test_signed_i128_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll index fefc92c313511..47dc3ca3616ea 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -1883,7 +1883,7 @@ define i1 @test_unsigned_i1_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -1965,7 +1965,7 @@ define i8 @test_unsigned_i8_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2047,7 +2047,7 @@ define i13 @test_unsigned_i13_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2128,7 +2128,7 @@ define i16 @test_unsigned_i16_f16(half %f) nounwind { ; X86-X87-NEXT: subl $12, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; 
X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2209,7 +2209,7 @@ define i19 @test_unsigned_i19_f16(half %f) nounwind { ; X86-X87-NEXT: subl $28, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2295,7 +2295,7 @@ define i32 @test_unsigned_i32_f16(half %f) nounwind { ; X86-X87-NEXT: subl $28, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: fnstcw {{[0-9]+}}(%esp) ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: orl $3072, %eax # imm = 0xC00 @@ -2382,7 +2382,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind { ; X86-X87-NEXT: subl $24, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fxch %st(1) ; X86-X87-NEXT: fucom %st(1) @@ -2526,7 +2526,7 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind { ; X86-X87-NEXT: subl $20, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} ; X86-X87-NEXT: fxch %st(1) ; X86-X87-NEXT: fucom %st(1) @@ -2667,7 +2667,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: subl $44, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) @@ -2821,7 +2821,7 @@ define i128 @test_unsigned_i128_f16(half %f) nounwind { ; X86-X87-NEXT: subl $60, %esp ; X86-X87-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) -; X86-X87-NEXT: calll __gnu_h2f_ieee +; X86-X87-NEXT: calll __extendhfsf2 ; X86-X87-NEXT: leal {{[0-9]+}}(%esp), %eax ; X86-X87-NEXT: movl %eax, (%esp) ; X86-X87-NEXT: fsts {{[0-9]+}}(%esp) diff --git a/llvm/test/CodeGen/X86/frem.ll b/llvm/test/CodeGen/X86/frem.ll index 35d16c3bac70d..959265d08299a 100644 --- a/llvm/test/CodeGen/X86/frem.ll +++ b/llvm/test/CodeGen/X86/frem.ll @@ -82,7 +82,7 @@ define void @frem_f128(fp128 %a0, fp128 %a1, ptr%p3) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: pushq %rbx ; CHECK-NEXT: movq %rdi, %rbx -; CHECK-NEXT: callq fmodf128 +; CHECK-NEXT: callq fmodf128@PLT ; CHECK-NEXT: vmovaps %xmm0, (%rbx) ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/half-constrained.ll b/llvm/test/CodeGen/X86/half-constrained.ll index 0f73129d984bd..f1874cc03000a 100644 --- a/llvm/test/CodeGen/X86/half-constrained.ll +++ b/llvm/test/CodeGen/X86/half-constrained.ll @@ -15,7 +15,7 @@ define float @half_to_float() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: 
retl @@ -64,7 +64,7 @@ define double @half_to_double() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: retl @@ -116,7 +116,7 @@ define x86_fp80 @half_to_fp80() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 ; X86-NOF16C-NEXT: retl @@ -166,7 +166,7 @@ define void @float_to_half(float %0) strictfp { ; X86-NOF16C-NEXT: flds {{[0-9]+}}(%esp) ; X86-NOF16C-NEXT: fstps (%esp) ; X86-NOF16C-NEXT: wait -; X86-NOF16C-NEXT: calll __gnu_f2h_ieee +; X86-NOF16C-NEXT: calll __truncsfhf2 ; X86-NOF16C-NEXT: movw %ax, a ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 @@ -324,17 +324,17 @@ define void @add() strictfp { ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 16 ; X86-NOF16C-NEXT: movzwl a, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: fstps {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; X86-NOF16C-NEXT: wait ; X86-NOF16C-NEXT: movzwl b, %eax ; X86-NOF16C-NEXT: movl %eax, (%esp) -; X86-NOF16C-NEXT: calll __gnu_h2f_ieee +; X86-NOF16C-NEXT: calll __extendhfsf2 ; X86-NOF16C-NEXT: flds {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload ; X86-NOF16C-NEXT: faddp %st, %st(1) ; X86-NOF16C-NEXT: fstps (%esp) ; X86-NOF16C-NEXT: wait -; X86-NOF16C-NEXT: calll __gnu_f2h_ieee +; X86-NOF16C-NEXT: calll __truncsfhf2 ; X86-NOF16C-NEXT: movw %ax, c ; X86-NOF16C-NEXT: addl $12, %esp ; X86-NOF16C-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll index 3c6e14598571d..859139463b7e3 100644 --- a/llvm/test/CodeGen/X86/ldexp.ll +++ b/llvm/test/CodeGen/X86/ldexp.ll @@ -608,14 +608,14 @@ define half @ldexp_f16(half %arg0, i32 %arg1) { ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: movl %esi, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) ; WIN32-NEXT: calll _ldexp ; WIN32-NEXT: fstps {{[0-9]+}}(%esp) ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: addl $16, %esp ; WIN32-NEXT: popl %esi ; WIN32-NEXT: retl diff --git a/llvm/test/CodeGen/X86/llvm.frexp.ll b/llvm/test/CodeGen/X86/llvm.frexp.ll index 96de34519556d..8436c1052552e 100644 --- a/llvm/test/CodeGen/X86/llvm.frexp.ll +++ b/llvm/test/CodeGen/X86/llvm.frexp.ll @@ -45,7 +45,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) @@ -54,7 +54,7 @@ define { half, i32 } @test_frexp_f16_i32(half %a) { ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) ; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: movl %esi, %edx 
; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: popl %esi @@ -95,7 +95,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; WIN32-NEXT: subl $20, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) @@ -103,7 +103,7 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) { ; WIN32-NEXT: fstps {{[0-9]+}}(%esp) ; WIN32-NEXT: flds {{[0-9]+}}(%esp) ; WIN32-NEXT: fstps (%esp) -; WIN32-NEXT: calll ___gnu_f2h_ieee +; WIN32-NEXT: calll ___truncsfhf2 ; WIN32-NEXT: addl $20, %esp ; WIN32-NEXT: retl %result = call { half, i32 } @llvm.frexp.f16.i32(half %a) @@ -146,7 +146,7 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) { ; WIN32-NEXT: subl $16, %esp ; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, (%esp) -; WIN32-NEXT: calll ___gnu_h2f_ieee +; WIN32-NEXT: calll ___extendhfsf2 ; WIN32-NEXT: leal {{[0-9]+}}(%esp), %eax ; WIN32-NEXT: movl %eax, {{[0-9]+}}(%esp) ; WIN32-NEXT: fstpl (%esp) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index be83db26aa7ed..89ed0040a71c2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -1215,10 +1215,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = 
zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512BW-NEXT: kmovq %rcx, %k1 ; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} @@ -1294,10 +1294,10 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 ; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index ba51c65ccab13..75c470a6d40c6 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -1161,23 +1161,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512BW-NEXT: 
vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1231,23 +1231,23 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] +; 
AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm1[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-NEXT: movl $287445282, %ecx # imm = 0x11221122 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm2[2,3,0,1,2,3,0,1] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] ; AVX512DQ-BW-NEXT: movl $1149781128, %ecx # imm = 0x44884488 ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm1 {%k1} ; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2126,41 +2126,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] -; AVX512BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2] +; AVX512BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = 
zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} -; AVX512BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2} ; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] -; AVX512BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3} +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = 
zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2231,41 +2230,40 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm7 = zmm2[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [0,2,0,2,0,2,0,2] +; AVX512DQ-BW-FCP-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm6, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm7 = zmm7[u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] ; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %ecx # imm = 0x88888888 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm3 = zmm0[0,2,0,2,4,6,4,6] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm6, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm4 = zmm4[0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm6, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm6 = zmm6[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,18,26,u,u,u,u,u,u,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $572662306, %ecx # imm = 0x22222222 ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm4 {%k2} ; AVX512DQ-BW-FCP-NEXT: movw 
$-21846, %cx # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm7, %zmm3 {%k3} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [5,7,5,7,5,7,5,7] -; AVX512DQ-BW-FCP-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm5, %zmm1, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[1,3,1,3,5,7,5,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm4 {%k3} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm2 = zmm2[u,u,u,u,0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm5, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm4, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,8,0,8,u,u,u,u,1,9,1,9,u,u,u,u,18,26,18,26,u,u,u,u,19,27,19,27,u,u,u,u,36,44,u,u,u,u,u,u,37,45,u,u,u,u,u,u,54,62,u,u,u,u,u,u,55,63,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} ; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm0 {%k3} ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 @@ -6905,7 +6903,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload @@ -6927,7 +6925,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) +; AVX512-NEXT: vpternlogq 
{{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload @@ -6944,7 +6942,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm23[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -6968,7 +6966,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 @@ -7035,7 +7033,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11) ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero ; AVX512-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero @@ -7057,7 +7055,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | 
(zmm14 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 @@ -7070,7 +7068,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 @@ -7083,7 +7081,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11) ; AVX512-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] ; AVX512-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 @@ -7589,7 +7587,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm11 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpbroadcastq {{.*#+}} zmm11 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm0 ^ (zmm11 & (zmm22 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm22 = zmm22 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload @@ -7611,7 +7609,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm10 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm0 ^ (zmm11 & (zmm24 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm24 = zmm24 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd $96, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm0 = mem[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd $232, {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Folded Reload @@ -7628,7 +7626,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm10 = ymm29[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm10 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm0 ^ (zmm11 & (zmm10 ^ zmm0)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm10 = zmm10 | (zmm0 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm25[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = 
ymm23[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -7652,7 +7650,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm4 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm1 ^ (zmm11 & (zmm4 ^ zmm1)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | (zmm1 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm20[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1 @@ -7719,7 +7717,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm14 ^ (zmm11 & (zmm0 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero,xmm8[2],zero,zero,zero,xmm8[3],zero,zero,zero,xmm8[4],zero,zero,zero,xmm8[5],zero,zero,zero,xmm8[6],zero,zero,zero,xmm8[7],zero,zero,zero ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero,xmm6[4],zero,zero,zero,xmm6[5],zero,zero,zero,xmm6[6],zero,zero,zero,xmm6[7],zero,zero,zero ; AVX512DQ-NEXT: vpmovzxwq {{.*#+}} zmm12 = xmm12[0],zero,zero,zero,xmm12[1],zero,zero,zero,xmm12[2],zero,zero,zero,xmm12[3],zero,zero,zero,xmm12[4],zero,zero,zero,xmm12[5],zero,zero,zero,xmm12[6],zero,zero,zero,xmm12[7],zero,zero,zero @@ -7741,7 +7739,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd $246, {{[-0-9]+}}(%r{{[sb]}}p), %ymm25 # 32-byte Folded Reload ; AVX512DQ-NEXT: # ymm25 = mem[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm14 ^ (zmm11 & (zmm8 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm30[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm25 = ymm31[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm25, %zmm14, %zmm14 @@ -7754,7 +7752,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm19[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm17 = ymm20[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm17, %zmm14, %zmm14 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm14 ^ (zmm11 & (zmm6 ^ zmm14)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm6 = zmm6 | (zmm14 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm14 = ymm23[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm14, %zmm9 @@ -7767,7 +7765,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.vec ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm13[0,1,1,3,4,5,5,7] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[2,1,3,3,6,5,7,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm2 ^ (zmm11 & (zmm12 ^ zmm2)) +; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm12 = zmm12 | (zmm2 & ~zmm11) ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm2 = ymm16[0,0,2,1,4,4,6,5] ; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm3 = ymm18[0,2,2,3,4,6,6,7] ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index a8df418143f32..717d1e447e165 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -98,8 +98,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -110,8 +109,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -122,8 +120,7 @@ define void @mask_replication_factor2_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512BW-NEXT: vptestmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -145,8 +142,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 ; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm1 {%k1} {z} @@ -162,8 +158,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpmovd2m %zmm1, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 ; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm0 {%k2} {z} ; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm1 
{%k1} {z} @@ -176,8 +171,7 @@ define void @mask_replication_factor2_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwd %ymm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -200,21 +194,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k2 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k2 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k1} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k3 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -227,21 +220,20 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: kmovw 2(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k2 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k3 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; 
AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k4} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k3} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm3, 128(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm2, 192(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm1, (%rdx) @@ -249,47 +241,25 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor2_vf32: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor2_vf32: -; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor2_vf32: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbw %ymm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 
192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <32 x i32> %tgt.mask = shufflevector <32 x i1> %src.mask, <32 x i1> poison, <64 x i32> @@ -301,42 +271,41 @@ define void @mask_replication_factor2_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %out.vec) nounwind { ; AVX512F-ONLY-LABEL: mask_replication_factor2_vf64: ; AVX512F-ONLY: # %bb.0: -; AVX512F-ONLY-NEXT: kmovw (%rdi), %k3 -; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k5 -; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw (%rdi), %k4 +; AVX512F-ONLY-NEXT: kmovw 2(%rdi), %k3 +; AVX512F-ONLY-NEXT: kmovw 4(%rdi), %k2 ; AVX512F-ONLY-NEXT: kmovw 6(%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512F-ONLY-NEXT: vptestmd %zmm2, %zmm2, %k1 ; AVX512F-ONLY-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512F-ONLY-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k4} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k4 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm2 {%k2} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k2 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm3 {%k3} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512F-ONLY-NEXT: vptestmd %zmm4, %zmm4, %k3 +; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm4 {%k4} {z} = -1 +; AVX512F-ONLY-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k5 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm0, %zmm0 +; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k4 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k6 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k5} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512F-ONLY-NEXT: vptestmd %zmm3, %zmm3, %k5 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm3, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k7 -; AVX512F-ONLY-NEXT: vpternlogd {{.*#+}} zmm0 {%k3} {z} = -1 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512F-ONLY-NEXT: vptestmd %zmm1, %zmm1, %k3 -; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512F-ONLY-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512F-ONLY-NEXT: vptestmd %zmm0, %zmm0, %k1 -; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k3} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k7} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k6} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k4} {z} -; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512F-ONLY-NEXT: 
vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512F-ONLY-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512F-ONLY-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512F-ONLY-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -351,41 +320,40 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-LABEL: mask_replication_factor2_vf64: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 -; AVX512DQ-NEXT: kmovw 2(%rdi), %k5 -; AVX512DQ-NEXT: kmovw 4(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 2(%rdi), %k3 +; AVX512DQ-NEXT: kmovw 4(%rdi), %k2 ; AVX512DQ-NEXT: kmovw 6(%rdi), %k1 ; AVX512DQ-NEXT: vpmovm2d %k1, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm1 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm2 ; AVX512DQ-NEXT: vpmovd2m %zmm2, %k1 ; AVX512DQ-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm2 = [8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 -; AVX512DQ-NEXT: vpmovd2m %zmm0, %k2 -; AVX512DQ-NEXT: vpmovm2d %k3, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovm2d %k2, %zmm2 +; AVX512DQ-NEXT: vpermd %zmm2, %zmm1, %zmm3 +; AVX512DQ-NEXT: vpmovd2m %zmm3, %k2 +; AVX512DQ-NEXT: vpmovm2d %k3, %zmm3 +; AVX512DQ-NEXT: vpermd %zmm3, %zmm1, %zmm4 +; AVX512DQ-NEXT: vpmovd2m %zmm4, %k3 +; AVX512DQ-NEXT: vpmovm2d %k0, %zmm4 +; AVX512DQ-NEXT: vpermd %zmm4, %zmm1, %zmm1 +; AVX512DQ-NEXT: vpmovd2m %zmm1, %k5 +; AVX512DQ-NEXT: vpmovsxdq %ymm0, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k4 -; AVX512DQ-NEXT: vpmovm2d %k5, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm3 -; AVX512DQ-NEXT: vpmovd2m %zmm3, %k5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm2, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k6 -; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 -; AVX512DQ-NEXT: vpmovd2m %zmm1, %k7 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm2, %zmm0 +; AVX512DQ-NEXT: vpmovsxdq %ymm3, %zmm0 +; AVX512DQ-NEXT: vpmovd2m %zmm0, %k7 +; AVX512DQ-NEXT: vpmovsxdq %ymm4, %zmm0 ; AVX512DQ-NEXT: vpmovd2m %zmm0, %k1 -; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} -; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k7} {z} -; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k6} {z} -; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k5} {z} -; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k4} {z} -; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k3} {z} -; AVX512DQ-NEXT: vmovdqa32 448(%rsi), %zmm6 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} +; AVX512DQ-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k3} {z} +; AVX512DQ-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k7} {z} +; AVX512DQ-NEXT: vmovdqa32 320(%rsi), %zmm4 {%k2} {z} +; AVX512DQ-NEXT: vmovdqa32 256(%rsi), %zmm5 {%k6} {z} ; AVX512DQ-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k1} {z} +; AVX512DQ-NEXT: 
vmovdqa32 448(%rsi), %zmm6 {%k1} {z} +; AVX512DQ-NEXT: vmovdqa32 384(%rsi), %zmm7 {%k4} {z} ; AVX512DQ-NEXT: vmovdqa64 %zmm7, 384(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm6, 448(%rdx) ; AVX512DQ-NEXT: vmovdqa64 %zmm5, 256(%rdx) @@ -402,12 +370,9 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,6,7,6,7] -; AVX512BW-ONLY-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-ONLY-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,2,3,2,3] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -441,8 +406,7 @@ define void @mask_replication_factor2_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbw %ymm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2525,8 +2489,7 @@ define void @mask_replication_factor4_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7] -; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; AVX512BW-NEXT: vpmovsxwq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovw2m %zmm0, %k1 ; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm0 {%k1} {z} ; AVX512BW-NEXT: kshiftrd $16, %k1, %k1 @@ -2598,47 +2561,25 @@ define void @mask_replication_factor4_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; -; AVX512BW-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512BW-ONLY: # %bb.0: -; AVX512BW-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] -; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512BW-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512BW-ONLY-NEXT: 
vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512BW-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512BW-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512BW-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512BW-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512BW-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512BW-ONLY-NEXT: vzeroupper -; AVX512BW-ONLY-NEXT: retq -; -; AVX512VBMI-ONLY-LABEL: mask_replication_factor4_vf16: -; AVX512VBMI-ONLY: # %bb.0: -; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 -; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 -; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 -; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $48, %k1, %k2 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} -; AVX512VBMI-ONLY-NEXT: kshiftrq $32, %k1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm3, 128(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm2, 192(%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm1, (%rdx) -; AVX512VBMI-ONLY-NEXT: vmovdqa64 %zmm0, 64(%rdx) -; AVX512VBMI-ONLY-NEXT: vzeroupper -; AVX512VBMI-ONLY-NEXT: retq +; AVX512BW-LABEL: mask_replication_factor4_vf16: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: kmovq (%rdi), %k0 +; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 +; AVX512BW-NEXT: vpmovsxbd %xmm0, %zmm0 +; AVX512BW-NEXT: vpmovb2m %zmm0, %k1 +; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 (%rsi), %zmm1 {%k1} {z} +; AVX512BW-NEXT: kshiftrq $48, %k1, %k2 +; AVX512BW-NEXT: vmovdqa32 192(%rsi), %zmm2 {%k2} {z} +; AVX512BW-NEXT: kshiftrq $32, %k1, %k1 +; AVX512BW-NEXT: vmovdqa32 128(%rsi), %zmm3 {%k1} {z} +; AVX512BW-NEXT: vmovdqa64 %zmm3, 128(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm1, (%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rdx) +; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: retq %src.mask.padded = load <64 x i1>, ptr %in.maskvec, align 64 %src.mask = shufflevector <64 x i1> %src.mask.padded, <64 x i1> poison, <16 x i32> %tgt.mask = shufflevector <16 x i1> %src.mask, <16 x i1> poison, <64 x i32> @@ -2747,11 +2688,9 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: kmovd (%rdi), %k0 ; AVX512BW-ONLY-NEXT: vpmovm2b %k0, %zmm0 ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-ONLY-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,40,40,40,40,41,41,41,41,42,42,42,42,43,43,43,43,60,60,60,60,61,61,61,61,62,62,62,62,63,63,63,63] ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2785,8 +2724,7 @@ define void @mask_replication_factor4_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k1 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k2 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k2, %k3 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -2997,8 +2935,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] ; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm1, %zmm1 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-ONLY-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-ONLY-NEXT: vpshufb %zmm2, %zmm0, %zmm0 +; AVX512BW-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512BW-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -3060,8 +2997,7 @@ define void @mask_replication_factor4_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,16,16,16,17,17,17,17,18,18,18,18,19,19,19,19,20,20,20,20,21,21,21,21,22,22,22,22,23,23,23,23,24,24,24,24,25,25,25,25,26,26,26,26,27,27,27,27,28,28,28,28,29,29,29,29,30,30,30,30,31,31,31,31] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm1 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm1, %k3 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,7,7,7,7,8,8,8,8,9,9,9,9,10,10,10,10,11,11,11,11,12,12,12,12,13,13,13,13,14,14,14,14,15,15,15,15] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k4 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k4, %k5 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -12956,8 +12892,7 @@ define void @mask_replication_factor8_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY: # %bb.0: ; AVX512VBMI-ONLY-NEXT: kmovq (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 +; AVX512VBMI-ONLY-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpmovb2m %zmm0, %k1 ; AVX512VBMI-ONLY-NEXT: kshiftrq $16, %k1, %k2 ; AVX512VBMI-ONLY-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k2} {z} @@ -13083,10 +13018,10 @@ define void @mask_replication_factor8_vf16(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2b %k0, %zmm0 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = 
zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm0[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,26,26,26,26,26,26,26,26,27,27,27,27,27,27,27,27,44,44,44,44,44,44,44,44,45,45,45,45,45,45,45,45,62,62,62,62,62,62,62,62,63,63,63,63,63,63,63,63] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k1 -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k3 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k3} {z} @@ -13291,13 +13226,12 @@ define void @mask_replication_factor8_vf32(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vpmovb2m %zmm3, %k1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm1 +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm3, %zmm2 +; AVX512BW-NEXT: vpmovb2m %zmm2, %k3 +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,18,18,18,18,18,18,18,18,19,19,19,19,19,19,19,19,36,36,36,36,36,36,36,36,37,37,37,37,37,37,37,37,54,54,54,54,54,54,54,54,55,55,55,55,55,55,55,55] ; AVX512BW-NEXT: vpmovb2m %zmm1, %k2 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpmovb2m %zmm1, %k3 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k4 ; AVX512BW-NEXT: kshiftrq $16, %k4, %k5 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k5} {z} @@ -13680,16 +13614,16 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,8,8,8,8,8,8,8,9,9,9,9,9,9,9,9,10,10,10,10,10,10,10,10,11,11,11,11,11,11,11,11,12,12,12,12,12,12,12,12,13,13,13,13,13,13,13,13,14,14,14,14,14,14,14,14,15,15,15,15,15,15,15,15] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,6,6,6,6,6,6,6,6,7,7,7,7,7,7,7,7] -; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm12 +; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm10 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[4,5,4,5,4,5,4,5] ; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm15 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm16 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[2,3,2,3,2,3,2,3] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm10 +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vpshufb %zmm3, %zmm1, %zmm5 -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1] -; AVX512BW-NEXT: vpshufb %zmm2, %zmm0, %zmm1 -; AVX512BW-NEXT: vpshufb %zmm3, %zmm0, %zmm0 +; 
AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm0[0,1,0,1,0,1,0,1] +; AVX512BW-NEXT: vpshufb %zmm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpmovsxbq %xmm0, %zmm0 ; AVX512BW-NEXT: vpmovb2m %zmm0, %k2 ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 64(%rsi), %zmm0 {%k1} {z} @@ -13710,9 +13644,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 ; AVX512BW-NEXT: vmovdqa32 576(%rsi), %zmm9 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $32, %k2, %k1 -; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm11 {%k1} {z} -; AVX512BW-NEXT: vpmovb2m %zmm10, %k1 -; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm10 {%k2} {z} +; AVX512BW-NEXT: vmovdqa32 640(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm11, %k1 +; AVX512BW-NEXT: vmovdqa32 512(%rsi), %zmm11 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $48, %k2, %k2 ; AVX512BW-NEXT: vmovdqa32 704(%rsi), %zmm13 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $16, %k1, %k2 @@ -13735,8 +13669,8 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa32 1344(%rsi), %zmm22 {%k2} {z} ; AVX512BW-NEXT: kshiftrq $32, %k1, %k2 ; AVX512BW-NEXT: vmovdqa32 1408(%rsi), %zmm23 {%k2} {z} -; AVX512BW-NEXT: vpmovb2m %zmm12, %k2 -; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm12 {%k1} {z} +; AVX512BW-NEXT: vpmovb2m %zmm10, %k2 +; AVX512BW-NEXT: vmovdqa32 1280(%rsi), %zmm10 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $48, %k1, %k1 ; AVX512BW-NEXT: vmovdqa32 1472(%rsi), %zmm24 {%k1} {z} ; AVX512BW-NEXT: kshiftrq $16, %k2, %k1 @@ -13765,7 +13699,7 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm24, 1472(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm23, 1408(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm22, 1344(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm12, 1280(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 1280(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm21, 1216(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm20, 1152(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm19, 1088(%rdx) @@ -13775,9 +13709,9 @@ define void @mask_replication_factor8_vf64(ptr %in.maskvec, ptr %in.vec, ptr %ou ; AVX512BW-NEXT: vmovdqa64 %zmm14, 832(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm16, 768(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm13, 704(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm11, 640(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm12, 640(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm9, 576(%rdx) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 512(%rdx) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 512(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm8, 448(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm6, 384(%rdx) ; AVX512BW-NEXT: vmovdqa64 %zmm4, 320(%rdx) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index da65fecba773b..d6208aca3b2b7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1654,12 +1654,27 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { ; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-NEXT: retq ; -; AVX512VL-LABEL: shuffle_v4i64_0044_v2i64: -; AVX512VL: # %bb.0: -; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX512VL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] -; AVX512VL-NEXT: retq +; AVX512VL-SLOW-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps 
{{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [0,0,4,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_0044_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> @@ -1667,12 +1682,34 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { -; ALL-LABEL: shuffle_v4i64_1032_v2i64: -; ALL: # %bb.0: -; ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; ALL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; ALL-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; ALL-NEXT: retq +; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64: +; AVX1OR2: # %bb.0: +; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX1OR2-NEXT: retq +; +; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-SLOW: # %bb.0: +; AVX512VL-SLOW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-SLOW-NEXT: retq +; +; AVX512VL-FAST-ALL-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-ALL: # %bb.0: +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-FAST-ALL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-ALL-NEXT: vpmovsxbq {{.*#+}} ymm2 = [1,0,5,4] +; AVX512VL-FAST-ALL-NEXT: vpermt2q %ymm1, %ymm2, %ymm0 +; AVX512VL-FAST-ALL-NEXT: retq +; +; AVX512VL-FAST-PERLANE-LABEL: shuffle_v4i64_1032_v2i64: +; AVX512VL-FAST-PERLANE: # %bb.0: +; AVX512VL-FAST-PERLANE-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX512VL-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX512VL-FAST-PERLANE-NEXT: retq %1 = shufflevector <2 x i64> %a, <2 x i64> poison, <2 x i32> %2 = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> %3 = shufflevector <2 x i64> %1, <2 x i64> %2, <4 x i32> diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index f7c29cba30bd5..6b1d118ca97ad 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -6173,13 +6173,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; 
AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512F-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512F-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512F-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512F-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512F-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-FAST-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6209,13 +6209,13 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW-FAST: # %bb.0: ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] -; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm0[0,1,1,3,4,5,5,7] -; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm1 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] -; AVX512BW-FAST-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FAST-NEXT: vpandq %zmm1, %zmm0, %zmm0 +; AVX512BW-FAST-NEXT: vpmovzxbq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero +; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,u,u,u,u,u,u,u,3,u,u,u,u,u,u,u] +; AVX512BW-FAST-NEXT: vpmovsxbq {{.*#+}} zmm2 = [8,9,9,0,0,1,1,3] +; AVX512BW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 +; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm0 = [255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512BW-FAST-NEXT: # zmm0 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FAST-NEXT: vpandq %zmm0, %zmm2, %zmm0 ; AVX512BW-FAST-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm0, (%rcx) ; AVX512BW-FAST-NEXT: vzeroupper diff --git a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll index a3b065667702f..f706184f9727e 100644 --- a/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll +++ b/llvm/test/Transforms/FunctionAttrs/2009-01-02-LocalStores.ll @@ -14,7 +14,7 @@ define ptr @b(ptr %q) { ret ptr %tmp } -; CHECK: define ptr @c(ptr readnone returned captures(address_is_null, ret: address, provenance) %r) +; CHECK: define ptr @c(ptr readnone returned %r) @g = global i32 0 define ptr @c(ptr %r) { %a = icmp eq ptr %r, null diff --git a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll 
b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll index 99406696d33d1..13954694eefe0 100644 --- a/llvm/test/Transforms/FunctionAttrs/arg_returned.ll +++ b/llvm/test/Transforms/FunctionAttrs/arg_returned.ll @@ -145,8 +145,8 @@ return: ; preds = %cond.end, %if.then3 ; TEST SCC test returning a pointer value argument ; -; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned captures(ret: address, provenance) %r) -; FNATTR: define ptr @ptr_scc_r1(ptr readnone %a, ptr readnone %r, ptr readnone captures(none) %b) +; FNATTR: define ptr @ptr_sink_r0(ptr readnone returned %r) +; FNATTR: define ptr @ptr_scc_r1(ptr %a, ptr readnone %r, ptr readnone captures(none) %b) ; FNATTR: define ptr @ptr_scc_r2(ptr readnone %a, ptr readnone %b, ptr readnone %r) ; ; @@ -260,8 +260,8 @@ entry: ; TEST another SCC test ; -; FNATTR: define ptr @rt2_helper(ptr readnone captures(address_is_null) %a) -; FNATTR: define ptr @rt2(ptr readnone captures(address_is_null) %a, ptr readnone captures(ret: address, provenance) %b) +; FNATTR: define ptr @rt2_helper(ptr %a) +; FNATTR: define ptr @rt2(ptr readnone %a, ptr readnone %b) define ptr @rt2_helper(ptr %a) #0 { entry: %call = call ptr @rt2(ptr %a, ptr %a) @@ -284,8 +284,8 @@ if.end: ; TEST another SCC test ; -; FNATTR: define ptr @rt3_helper(ptr readnone captures(address_is_null) %a, ptr readnone %b) -; FNATTR: define ptr @rt3(ptr readnone captures(address_is_null) %a, ptr readnone %b) +; FNATTR: define ptr @rt3_helper(ptr %a, ptr %b) +; FNATTR: define ptr @rt3(ptr readnone %a, ptr readnone %b) define ptr @rt3_helper(ptr %a, ptr %b) #0 { entry: %call = call ptr @rt3(ptr %a, ptr %b) @@ -316,7 +316,7 @@ if.end: ; } ; ; -; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned captures(ret: address, provenance) %r) +; FNATTR: define ptr @calls_unknown_fn(ptr readnone returned %r) declare void @unknown_fn(ptr) #0 define ptr @calls_unknown_fn(ptr %r) #0 { @@ -415,7 +415,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @bitcast(ptr readnone returned captures(ret: address, provenance) %b) +; FNATTR: define ptr @bitcast(ptr readnone returned %b) ; define ptr @bitcast(ptr %b) #0 { entry: @@ -433,7 +433,7 @@ entry: ; } ; ; -; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @bitcasts_select_and_phi(ptr readnone %b) ; define ptr @bitcasts_select_and_phi(ptr %b) #0 { entry: @@ -462,7 +462,7 @@ if.end: ; preds = %if.then, %entry ; } ; ; -; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_arg_arg_undef(ptr readnone %b) ; define ptr @ret_arg_arg_undef(ptr %b) #0 { entry: @@ -494,7 +494,7 @@ ret_undef: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_undef_arg_arg(ptr readnone %b) ; define ptr @ret_undef_arg_arg(ptr %b) #0 { entry: @@ -526,7 +526,7 @@ ret_arg1: ; } ; ; -; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone captures(address_is_null, ret: address, provenance) %b) +; FNATTR: define ptr @ret_undef_arg_undef(ptr readnone %b) define ptr @ret_undef_arg_undef(ptr %b) #0 { entry: %cmp = icmp eq ptr %b, null diff --git a/llvm/test/Transforms/FunctionAttrs/nocapture.ll b/llvm/test/Transforms/FunctionAttrs/nocapture.ll index 6debe5de3966e..6164f2adbf5b9 100644 --- a/llvm/test/Transforms/FunctionAttrs/nocapture.ll +++ b/llvm/test/Transforms/FunctionAttrs/nocapture.ll @@ -7,7 +7,7 @@ define ptr 
@c1(ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @c1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[Q:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[Q]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) @@ -512,7 +512,7 @@ define void @test4_1(ptr %x4_1, i1 %c) { define ptr @test4_2(ptr %x4_2, ptr %y4_2, ptr %z4_2, i1 %c) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @test4_2 -; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned captures(ret: address, provenance) [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { +; FNATTRS-SAME: (ptr readnone captures(none) [[X4_2:%.*]], ptr readnone returned [[Y4_2:%.*]], ptr readnone captures(none) [[Z4_2:%.*]], i1 [[C:%.*]]) #[[ATTR10]] { ; FNATTRS-NEXT: br i1 [[C]], label [[T:%.*]], label [[F:%.*]] ; FNATTRS: t: ; FNATTRS-NEXT: call void @test4_1(ptr null, i1 [[C]]) @@ -740,7 +740,7 @@ define void @captureStrip(ptr %p) { define i1 @captureICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -757,7 +757,7 @@ define i1 @captureICmp(ptr %x) { define i1 @captureICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @captureICmpRev -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr null, [[X]] ; FNATTRS-NEXT: ret i1 [[TMP1]] ; @@ -771,29 +771,10 @@ define i1 @captureICmpRev(ptr %x) { ret i1 %1 } -define i1 @captureICmpWrongPred(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @captureICmpWrongPred -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null -; FNATTRS-NEXT: ret i1 [[TMP1]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @captureICmpWrongPred -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp slt ptr [[X]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] -; - %1 = icmp slt ptr %x, null - ret i1 %1 -} - -; We could infer captures(address_is_null) here, but don't bother, because -; InstCombine will optimize the GEP away. 
define i1 @nocaptureInboundsGEPICmp(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -813,7 +794,7 @@ define i1 @nocaptureInboundsGEPICmp(ptr %x) { define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i1 @nocaptureInboundsGEPICmpRev -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[X:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 5 ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr null, [[TMP1]] ; FNATTRS-NEXT: ret i1 [[TMP2]] @@ -830,46 +811,6 @@ define i1 @nocaptureInboundsGEPICmpRev(ptr %x) { ret i1 %2 } -define i1 @notInboundsGEPICmp(ptr %x) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @notInboundsGEPICmp -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @notInboundsGEPICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = icmp eq ptr %1, null - ret i1 %2 -} - -define i1 @inboundsGEPICmpNullPointerDefined(ptr %x) null_pointer_is_valid { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; FNATTRS-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; FNATTRS-SAME: (ptr readnone captures(address) [[X:%.*]]) #[[ATTR16:[0-9]+]] { -; FNATTRS-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; FNATTRS-NEXT: ret i1 [[TMP2]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @inboundsGEPICmpNullPointerDefined -; ATTRIBUTOR-SAME: (ptr nofree readnone [[X:%.*]]) #[[ATTR12:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[X]], i32 5 -; ATTRIBUTOR-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP1]], null -; ATTRIBUTOR-NEXT: ret i1 [[TMP2]] -; - %1 = getelementptr i32, ptr %x, i32 5 - %2 = icmp eq ptr %1, null - ret i1 %2 -} - define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @nocaptureDereferenceableOrNullICmp @@ -890,13 +831,13 @@ define i1 @nocaptureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) define i1 @captureDereferenceableOrNullICmp(ptr dereferenceable_or_null(4) %x) null_pointer_is_valid { ; FNATTRS: Function Attrs: mustprogress 
nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; FNATTRS-LABEL: define noundef i1 @captureDereferenceableOrNullICmp -; FNATTRS-SAME: (ptr readnone captures(address_is_null) dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16]] { +; FNATTRS-SAME: (ptr readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR16:[0-9]+]] { ; FNATTRS-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; FNATTRS-NEXT: ret i1 [[TMP1]] ; ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind null_pointer_is_valid willreturn memory(none) ; ATTRIBUTOR-LABEL: define i1 @captureDereferenceableOrNullICmp -; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12]] { +; ATTRIBUTOR-SAME: (ptr nofree readnone dereferenceable_or_null(4) [[X:%.*]]) #[[ATTR12:[0-9]+]] { ; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = icmp eq ptr [[X]], null ; ATTRIBUTOR-NEXT: ret i1 [[TMP1]] ; @@ -962,7 +903,7 @@ define void @readnone_indirec(ptr %f, ptr %p) { define ptr @captures_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define ptr @captures_ret_only -; FNATTRS-SAME: (ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[P]], i64 8 ; FNATTRS-NEXT: ret ptr [[GEP]] ; @@ -976,8 +917,6 @@ define ptr @captures_ret_only(ptr %p) { ret ptr %gep } -; Even though the ptrtoint is only used in the return value, this should *not* -; be considered a read-only capture. define i64 @captures_not_ret_only(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define i64 @captures_not_ret_only @@ -996,52 +935,35 @@ define i64 @captures_not_ret_only(ptr %p) { } define void @captures_read_provenance(ptr %p) { -; FNATTRS-LABEL: define void @captures_read_provenance -; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_read_provenance -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_read_provenance +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) +; COMMON-NEXT: ret void ; call void @capture(ptr captures(address, read_provenance) %p) ret void } define void @captures_unused_ret(ptr %p) { -; FNATTRS-LABEL: define void @captures_unused_ret -; FNATTRS-SAME: (ptr captures(address_is_null) [[P:%.*]]) { -; FNATTRS-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @captures_unused_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret void +; COMMON-LABEL: define void @captures_unused_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[TMP1:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; COMMON-NEXT: ret void ; call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret void } define ptr @captures_used_ret(ptr %p) { -; FNATTRS-LABEL: 
define ptr @captures_used_ret -; FNATTRS-SAME: (ptr captures(address_is_null, ret: address, provenance) [[P:%.*]]) { -; FNATTRS-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; FNATTRS-NEXT: ret ptr [[RET]] -; -; ATTRIBUTOR-LABEL: define ptr @captures_used_ret -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: ret ptr [[RET]] +; COMMON-LABEL: define ptr @captures_used_ret +; COMMON-SAME: (ptr [[P:%.*]]) { +; COMMON-NEXT: [[RET:%.*]] = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) [[P]]) +; COMMON-NEXT: ret ptr [[RET]] ; %ret = call ptr @capture(ptr captures(address_is_null, ret: address, read_provenance) %p) ret ptr %ret } -; Make sure this is does not produce captures(ret: ...). We need to take the -; return capture components into account when handling argument SCCs. define ptr @scc_capture_via_ret(i1 %c, ptr %p) { ; FNATTRS: Function Attrs: nofree nosync nounwind memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define ptr @scc_capture_via_ret @@ -1077,72 +999,5 @@ else: ret ptr %p } -define i1 @improve_existing_captures(ptr captures(address) %p) { -; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; FNATTRS-LABEL: define i1 @improve_existing_captures -; FNATTRS-SAME: (ptr readnone captures(address_is_null) [[P:%.*]]) #[[ATTR0]] { -; FNATTRS-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; FNATTRS-NEXT: ret i1 [[CMP]] -; -; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) -; ATTRIBUTOR-LABEL: define i1 @improve_existing_captures -; ATTRIBUTOR-SAME: (ptr nofree readnone captures(address) [[P:%.*]]) #[[ATTR0]] { -; ATTRIBUTOR-NEXT: [[CMP:%.*]] = icmp eq ptr [[P]], null -; ATTRIBUTOR-NEXT: ret i1 [[CMP]] -; - %cmp = icmp eq ptr %p, null - ret i1 %cmp -} - -define void @dont_increase_existing_captures(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_trivial_scc(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_trivial_scc -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; COMMON-NEXT: call void @dont_increase_existing_captures_trivial_scc(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_trivial_scc(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc1(ptr captures(address) %p) { -; COMMON-LABEL: define void @dont_increase_existing_captures_scc1 -; COMMON-SAME: (ptr captures(address) [[P:%.*]]) { -; COMMON-NEXT: call void @dont_increase_existing_captures_scc2(ptr [[P]]) -; COMMON-NEXT: ret void -; - call void @dont_increase_existing_captures_scc2(ptr %p) - ret void -} - -define void @dont_increase_existing_captures_scc2(ptr %p) { -; FNATTRS-LABEL: define void @dont_increase_existing_captures_scc2 -; FNATTRS-SAME: (ptr captures(address, read_provenance) [[P:%.*]]) { -; FNATTRS-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; FNATTRS-NEXT: call void 
@dont_increase_existing_captures_scc1(ptr [[P]]) -; FNATTRS-NEXT: ret void -; -; ATTRIBUTOR-LABEL: define void @dont_increase_existing_captures_scc2 -; ATTRIBUTOR-SAME: (ptr [[P:%.*]]) { -; ATTRIBUTOR-NEXT: call void @capture(ptr captures(address, read_provenance) [[P]]) -; ATTRIBUTOR-NEXT: call void @dont_increase_existing_captures_scc1(ptr [[P]]) -; ATTRIBUTOR-NEXT: ret void -; - call void @capture(ptr captures(address, read_provenance) %p) - call void @dont_increase_existing_captures_scc1(ptr %p) - ret void -} - declare ptr @llvm.launder.invariant.group.p0(ptr) declare ptr @llvm.strip.invariant.group.p0(ptr) diff --git a/llvm/test/Transforms/FunctionAttrs/nonnull.ll b/llvm/test/Transforms/FunctionAttrs/nonnull.ll index 94093568419af..0f6762f0d4342 100644 --- a/llvm/test/Transforms/FunctionAttrs/nonnull.ll +++ b/llvm/test/Transforms/FunctionAttrs/nonnull.ll @@ -19,7 +19,7 @@ define ptr @test1() { ; Return a pointer trivially nonnull (argument attribute) define ptr @test2(ptr nonnull %p) { ; FNATTRS-LABEL: define nonnull ptr @test2( -; FNATTRS-SAME: ptr nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: ptr nonnull readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: ret ptr [[P]] ; ; ATTRIBUTOR-LABEL: define nonnull ptr @test2( @@ -194,7 +194,7 @@ exit: define ptr @test7(ptr %a) { ; FNATTRS-LABEL: define ptr @test7( -; FNATTRS-SAME: ptr readnone returned captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone returned [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr [[A]] ; ; ATTRIBUTOR-LABEL: define ptr @test7( @@ -206,7 +206,7 @@ define ptr @test7(ptr %a) { define ptr @test8(ptr %a) { ; FNATTRS-LABEL: define nonnull ptr @test8( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 1 ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -221,7 +221,7 @@ define ptr @test8(ptr %a) { define ptr @test9(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test9( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] ; FNATTRS-NEXT: ret ptr [[B]] ; @@ -238,7 +238,7 @@ declare void @llvm.assume(i1) ; FIXME: missing nonnull define ptr @test10(ptr %a, i64 %n) { ; FNATTRS-LABEL: define ptr @test10( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[A:%.*]], i64 [[N:%.*]]) #[[ATTR3:[0-9]+]] { ; FNATTRS-NEXT: [[CMP:%.*]] = icmp ne i64 [[N]], 0 ; FNATTRS-NEXT: call void @llvm.assume(i1 [[CMP]]) ; FNATTRS-NEXT: [[B:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[N]] @@ -263,7 +263,7 @@ define ptr @test10(ptr %a, i64 %n) { ; } define ptr @test11(ptr) local_unnamed_addr { ; FNATTRS-LABEL: define nonnull ptr @test11( -; FNATTRS-SAME: ptr readnone captures(address_is_null, ret: address, provenance) [[TMP0:%.*]]) local_unnamed_addr { +; FNATTRS-SAME: ptr readnone [[TMP0:%.*]]) local_unnamed_addr { ; FNATTRS-NEXT: [[TMP2:%.*]] = icmp eq ptr [[TMP0]], null ; FNATTRS-NEXT: br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP5:%.*]] ; FNATTRS: 3: @@ -362,7 +362,7 @@ declare nonnull ptr @nonnull() define internal ptr @f1(ptr %arg) { ; FIXME: missing nonnull It should be nonnull @f1(ptr nonnull 
readonly %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f1( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { +; FNATTRS-SAME: ptr readonly [[ARG:%.*]]) #[[ATTR4:[0-9]+]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = icmp eq ptr [[ARG]], null ; FNATTRS-NEXT: br i1 [[TMP]], label [[BB9:%.*]], label [[BB1:%.*]] @@ -431,7 +431,7 @@ bb9: ; preds = %bb4, %bb define internal ptr @f2(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f2(ptr nonnull %arg) ; FNATTRS-LABEL: define internal nonnull ptr @f2( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = tail call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -452,7 +452,7 @@ bb: define dso_local noalias ptr @f3(ptr %arg) { ; FIXME: missing nonnull. It should be nonnull @f3(ptr nonnull readonly %arg) ; FNATTRS-LABEL: define dso_local noalias nonnull ptr @f3( -; FNATTRS-SAME: ptr readonly captures(address_is_null) [[ARG:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: ptr [[ARG:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: bb: ; FNATTRS-NEXT: [[TMP:%.*]] = call ptr @f1(ptr [[ARG]]) ; FNATTRS-NEXT: ret ptr [[TMP]] @@ -945,7 +945,7 @@ exc: define ptr @gep1(ptr %p) { ; FNATTRS-LABEL: define nonnull ptr @gep1( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -961,7 +961,7 @@ define ptr @gep1(ptr %p) { define ptr @gep1_no_null_opt(ptr %p) #0 { ; Should't be able to derive nonnull based on gep. ; FNATTRS-LABEL: define ptr @gep1_no_null_opt( -; FNATTRS-SAME: ptr readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR8:[0-9]+]] { +; FNATTRS-SAME: ptr readnone [[P:%.*]]) #[[ATTR8:[0-9]+]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr [[P]], i32 1 ; FNATTRS-NEXT: ret ptr [[Q]] ; @@ -976,7 +976,7 @@ define ptr @gep1_no_null_opt(ptr %p) #0 { define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FNATTRS-LABEL: define ptr addrspace(3) @gep2( -; FNATTRS-SAME: ptr addrspace(3) readnone captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[Q:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[P]], i32 1 ; FNATTRS-NEXT: ret ptr addrspace(3) [[Q]] ; @@ -992,7 +992,7 @@ define ptr addrspace(3) @gep2(ptr addrspace(3) %p) { ; FIXME: We should propagate dereferenceable here but *not* nonnull define ptr addrspace(3) @as(ptr addrspace(3) dereferenceable(4) %p) { ; FNATTRS-LABEL: define noundef ptr addrspace(3) @as( -; FNATTRS-SAME: ptr addrspace(3) readnone returned captures(ret: address, provenance) dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr addrspace(3) readnone returned dereferenceable(4) [[P:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: ret ptr addrspace(3) [[P]] ; ; ATTRIBUTOR-LABEL: define ptr addrspace(3) @as( @@ -1383,7 +1383,7 @@ define void @PR43833_simple(ptr %0, i32 %1) { define ptr @pr91177_non_inbounds_gep(ptr nonnull %arg) { ; FNATTRS-LABEL: define ptr @pr91177_non_inbounds_gep( -; FNATTRS-SAME: ptr nonnull readnone captures(ret: address, provenance) [[ARG:%.*]]) #[[ATTR0]] { +; FNATTRS-SAME: ptr nonnull readnone [[ARG:%.*]]) #[[ATTR0]] { ; FNATTRS-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[ARG]], i64 -8 ; FNATTRS-NEXT: ret ptr [[RES]] ; diff --git 
a/llvm/test/Transforms/FunctionAttrs/noundef.ll b/llvm/test/Transforms/FunctionAttrs/noundef.ll index 4f53c08804621..b7c583880501a 100644 --- a/llvm/test/Transforms/FunctionAttrs/noundef.ll +++ b/llvm/test/Transforms/FunctionAttrs/noundef.ll @@ -169,7 +169,7 @@ define i64 @test_trunc_with_constexpr() { define align 4 ptr @maybe_not_aligned(ptr noundef %p) { ; CHECK-LABEL: define align 4 ptr @maybe_not_aligned( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -177,7 +177,7 @@ define align 4 ptr @maybe_not_aligned(ptr noundef %p) { define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { ; CHECK-LABEL: define noundef align 4 ptr @definitely_aligned( -; CHECK-SAME: ptr noundef readnone returned align 4 captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned align 4 [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -185,7 +185,7 @@ define align 4 ptr @definitely_aligned(ptr noundef align 4 %p) { define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { ; CHECK-LABEL: define nonnull ptr @maybe_not_nonnull( -; CHECK-SAME: ptr noundef readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p @@ -193,7 +193,7 @@ define nonnull ptr @maybe_not_nonnull(ptr noundef %p) { define nonnull ptr @definitely_nonnull(ptr noundef nonnull %p) { ; CHECK-LABEL: define noundef nonnull ptr @definitely_nonnull( -; CHECK-SAME: ptr noundef nonnull readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0]] { +; CHECK-SAME: ptr noundef nonnull readnone returned [[P:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: ret ptr [[P]] ; ret ptr %p diff --git a/llvm/test/Transforms/FunctionAttrs/readattrs.ll b/llvm/test/Transforms/FunctionAttrs/readattrs.ll index 5fc88d623c0ec..b24c097ad54d0 100644 --- a/llvm/test/Transforms/FunctionAttrs/readattrs.ll +++ b/llvm/test/Transforms/FunctionAttrs/readattrs.ll @@ -35,7 +35,7 @@ define void @test1_2(ptr %x1_2, ptr %y1_2, ptr %z1_2) { define ptr @test2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(write, argmem: none, inaccessiblemem: none) ; FNATTRS-LABEL: define {{[^@]+}}@test2 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR0:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; FNATTRS-NEXT: store i32 0, ptr @x, align 4 ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -58,7 +58,7 @@ define ptr @test2(ptr %p) { define i1 @test3(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test3 -; FNATTRS-SAME: (ptr readnone captures(address) [[P:%.*]], ptr readnone captures(address) [[Q:%.*]]) #[[ATTR1:[0-9]+]] { +; FNATTRS-SAME: (ptr readnone [[P:%.*]], ptr readnone [[Q:%.*]]) #[[ATTR1:[0-9]+]] { ; FNATTRS-NEXT: [[A:%.*]] = icmp ult ptr [[P]], [[Q]] ; FNATTRS-NEXT: ret i1 [[A]] ; @@ -197,7 +197,7 @@ define void @test7_2(ptr preallocated(i32) %a) { define ptr @test8_1(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none) ; FNATTRS-LABEL: define {{[^@]+}}@test8_1 -; FNATTRS-SAME: (ptr readnone returned captures(ret: address, provenance) [[P:%.*]]) #[[ATTR1]] { +; 
FNATTRS-SAME: (ptr readnone returned [[P:%.*]]) #[[ATTR1]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: ret ptr [[P]] ; @@ -220,7 +220,7 @@ entry: define void @test8_2(ptr %p) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: write) ; FNATTRS-LABEL: define {{[^@]+}}@test8_2 -; FNATTRS-SAME: (ptr writeonly captures(none) [[P:%.*]]) #[[ATTR4]] { +; FNATTRS-SAME: (ptr writeonly [[P:%.*]]) #[[ATTR4]] { ; FNATTRS-NEXT: entry: ; FNATTRS-NEXT: [[CALL:%.*]] = call ptr @test8_1(ptr [[P]]) ; FNATTRS-NEXT: store i32 10, ptr [[CALL]], align 4 diff --git a/llvm/test/Transforms/FunctionAttrs/stats.ll b/llvm/test/Transforms/FunctionAttrs/stats.ll index dc0387e57174a..5f007b4078ff3 100644 --- a/llvm/test/Transforms/FunctionAttrs/stats.ll +++ b/llvm/test/Transforms/FunctionAttrs/stats.ll @@ -16,8 +16,8 @@ entry: ret void } -; CHECK: 1 function-attrs - Number of arguments marked captures(none) -; CHECK-NEXT: 2 function-attrs - Number of functions with improved memory attribute +; CHECK: 2 function-attrs - Number of functions with improved memory attribute +; CHECK-NEXT: 1 function-attrs - Number of arguments marked nocapture ; CHECK-NEXT: 1 function-attrs - Number of functions marked as nofree ; CHECK-NEXT: 2 function-attrs - Number of functions marked as norecurse ; CHECK-NEXT: 2 function-attrs - Number of functions marked as nosync diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll index 1218799ab3dc5..e49c2f6214114 100644 --- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll +++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll @@ -391,9 +391,8 @@ define i32 @test15e_extra_use(i32 %X) { ;; (a & 128) ? 256 : 0 define i32 @test15e_zext(i8 %X) { ; CHECK-LABEL: @test15e_zext( -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X:%.*]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp sgt i8 [[X:%.*]], -1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i32 0, i32 256 ; CHECK-NEXT: ret i32 [[T3]] ; %t1 = and i8 %X, 128 @@ -406,9 +405,7 @@ define i32 @test15e_zext(i8 %X) { define i32 @test15e_zext_extra_use(i8 %X) { ; CHECK-LABEL: @test15e_zext_extra_use( ; CHECK-NEXT: [[T2:%.*]] = icmp slt i8 [[X:%.*]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[X]], -128 -; CHECK-NEXT: [[TMP2:%.*]] = zext i8 [[TMP1]] to i32 -; CHECK-NEXT: [[T3:%.*]] = shl nuw nsw i32 [[TMP2]], 1 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 256, i32 0 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -438,8 +435,7 @@ define i32 @test15f_extra_use(i32 %X) { ; CHECK-LABEL: @test15f_extra_use( ; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 ; CHECK-NEXT: [[T2:%.*]] = icmp ne i32 [[T1]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i32 [[T1]], 1 -; CHECK-NEXT: [[T3:%.*]] = xor i32 [[TMP1]], 256 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2]], i32 0, i32 256 ; CHECK-NEXT: call void @use1(i1 [[T2]]) ; CHECK-NEXT: ret i32 [[T3]] ; @@ -453,10 +449,9 @@ define i32 @test15f_extra_use(i32 %X) { ;; (a & 128) ? 
0 : 256 define i16 @test15f_trunc(i32 %X) { ; CHECK-LABEL: @test15f_trunc( -; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[X:%.*]] to i16 -; CHECK-NEXT: [[TMP2:%.*]] = shl i16 [[TMP1]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = and i16 [[TMP2]], 256 -; CHECK-NEXT: [[T3:%.*]] = xor i16 [[TMP3]], 256 +; CHECK-NEXT: [[T1:%.*]] = and i32 [[X:%.*]], 128 +; CHECK-NEXT: [[T2_NOT:%.*]] = icmp eq i32 [[T1]], 0 +; CHECK-NEXT: [[T3:%.*]] = select i1 [[T2_NOT]], i16 256, i16 0 ; CHECK-NEXT: ret i16 [[T3]] ; %t1 = and i32 %X, 128 @@ -799,7 +794,9 @@ define i8 @select_bittest_to_xor(i8 %x) { ; CHECK-LABEL: @select_bittest_to_xor( ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], -1 ; CHECK-NEXT: call void @use1(i1 [[CMP]]) -; CHECK-NEXT: [[MASKSEL:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 127 +; CHECK-NEXT: [[MASKSEL1:%.*]] = select i1 [[CMP]], i8 -128, i8 0 +; CHECK-NEXT: [[MASKSEL:%.*]] = or disjoint i8 [[AND]], [[MASKSEL1]] ; CHECK-NEXT: ret i8 [[MASKSEL]] ; %cmp = icmp sgt i8 %x, -1 diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 67dec9178eeca..ca2e23c1d082e 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -1754,9 +1754,9 @@ define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { define i8 @select_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1767,9 +1767,9 @@ define i8 @select_trunc_or_2(i8 %x, i8 %y) { define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_not_trunc_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc i8 %x to i1 @@ -1781,9 +1781,8 @@ define i8 @select_not_trunc_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nuw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nuw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP1]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nuw i8 %x to i1 @@ -1794,9 +1793,9 @@ define i8 @select_trunc_nuw_or_2(i8 %x, i8 %y) { define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) { ; CHECK-LABEL: @select_trunc_nsw_or_2( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc nsw i8 [[X:%.*]] to i1 -; CHECK-NEXT: [[OR:%.*]] = or i8 [[Y:%.*]], 2 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[TRUNC]], i8 [[OR]], i8 [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = and i8 [[TMP1]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or i8 [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret i8 [[SELECT]] ; %trunc = trunc nsw i8 %x to i1 @@ -1807,9 +1806,9 @@ define i8 @select_trunc_nsw_or_2(i8 %x, i8 %y) 
{ define <2 x i8> @select_trunc_or_2_vec(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @select_trunc_or_2_vec( -; CHECK-NEXT: [[TRUNC:%.*]] = trunc <2 x i8> [[X:%.*]] to <2 x i1> -; CHECK-NEXT: [[OR:%.*]] = or <2 x i8> [[Y:%.*]], splat (i8 2) -; CHECK-NEXT: [[SELECT:%.*]] = select <2 x i1> [[TRUNC]], <2 x i8> [[OR]], <2 x i8> [[Y]] +; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], splat (i8 1) +; CHECK-NEXT: [[TMP2:%.*]] = and <2 x i8> [[TMP1]], splat (i8 2) +; CHECK-NEXT: [[SELECT:%.*]] = or <2 x i8> [[Y:%.*]], [[TMP2]] ; CHECK-NEXT: ret <2 x i8> [[SELECT]] ; %trunc = trunc <2 x i8> %x to <2 x i1> diff --git a/llvm/test/Transforms/LoopVectorize/loop-form.ll b/llvm/test/Transforms/LoopVectorize/loop-form.ll index 730d488119d13..4a9380b3f35e8 100644 --- a/llvm/test/Transforms/LoopVectorize/loop-form.ll +++ b/llvm/test/Transforms/LoopVectorize/loop-form.ll @@ -1321,3 +1321,93 @@ exit: ret i32 %accum } +define i16 @multiple_exit_none_via_latch(ptr %dst, i64 %x) { +; CHECK-LABEL: @multiple_exit_none_via_latch( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[X:%.*]], i64 100) +; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP0]], 2 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 2, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[TMP2]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP4]] +; CHECK-NEXT: store i64 0, ptr [[TMP5]], align 8 +; CHECK-NEXT: store i64 0, ptr [[TMP6]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i64 0, ptr [[GEP]], align 8 +; CHECK-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; CHECK-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; CHECK: loop.then: +; CHECK-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: br label [[LOOP_HEADER]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK: exit.1: +; CHECK-NEXT: ret i16 0 +; CHECK: exit.2: +; CHECK-NEXT: ret i16 1 +; +; TAILFOLD-LABEL: @multiple_exit_none_via_latch( +; TAILFOLD-NEXT: entry: +; TAILFOLD-NEXT: br label [[LOOP_HEADER:%.*]] +; TAILFOLD: loop.header: +; 
TAILFOLD-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; TAILFOLD-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[IV]] +; TAILFOLD-NEXT: store i64 0, ptr [[GEP]], align 8 +; TAILFOLD-NEXT: [[CMP120:%.*]] = icmp slt i64 [[IV]], 100 +; TAILFOLD-NEXT: br i1 [[CMP120]], label [[LOOP_THEN:%.*]], label [[EXIT_2:%.*]] +; TAILFOLD: loop.then: +; TAILFOLD-NEXT: [[CMP3:%.*]] = icmp ne i64 [[IV]], [[X:%.*]] +; TAILFOLD-NEXT: br i1 [[CMP3]], label [[LOOP_LATCH]], label [[EXIT_1:%.*]] +; TAILFOLD: loop.latch: +; TAILFOLD-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; TAILFOLD-NEXT: br label [[LOOP_HEADER]] +; TAILFOLD: exit.1: +; TAILFOLD-NEXT: ret i16 0 +; TAILFOLD: exit.2: +; TAILFOLD-NEXT: ret i16 1 +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i64 0, ptr %gep + %cmp120 = icmp slt i64 %iv, 100 + br i1 %cmp120, label %loop.then, label %exit.2 + +loop.then: + %cmp3 = icmp ne i64 %iv, %x + br i1 %cmp3, label %loop.latch, label %exit.1 + +loop.latch: + %iv.next = add i64 %iv, 1 + br label %loop.header + +exit.1: + ret i16 0 + +exit.2: + ret i16 1 +} diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll index 7175816963ed1..e01dba328a3a1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/block_scaling_decompr_8bit.ll @@ -9,7 +9,7 @@ target triple = "aarch64" define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_(i32 noundef %n_prb, ptr noundef %src, ptr noundef %dst, ptr noundef %scale) #0 { ; CHECK-LABEL: define dso_local noundef i32 @_Z33block_scaling_decompr_8bitjPK27compressed_data_8bitP20cmplx_int16_tPKS2_( -; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(address_is_null) [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-SAME: i32 noundef [[N_PRB:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly [[SCALE:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP47_NOT:%.*]] = icmp eq i32 [[N_PRB]], 0 ; CHECK-NEXT: br i1 [[CMP47_NOT]], label %[[FOR_END:.*]], label %[[FOR_BODY_LR_PH:.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll index d5edf83ee52e2..bbd4849c32296 100644 --- a/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll +++ b/llvm/test/Transforms/PhaseOrdering/bitcast-store-branch.ll @@ -12,7 +12,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, half %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], half [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 
; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll index 4b422f205138a..ee7698b116aa2 100644 --- a/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll +++ b/llvm/test/Transforms/PhaseOrdering/dce-after-argument-promotion-loads.ll @@ -14,7 +14,7 @@ entry: define ptr @parent(ptr align 8 dereferenceable(72) %f, i16 %val1, i16 %val2, i32 %val3) align 2 { ; CHECK-LABEL: define noundef nonnull ptr @parent -; CHECK-SAME: (ptr readonly returned align 8 captures(ret: address, provenance) dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { +; CHECK-SAME: (ptr readonly returned align 8 dereferenceable(72) [[F:%.*]], i16 [[VAL1:%.*]], i16 [[VAL2:%.*]], i32 [[VAL3:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] align 2 { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[F]], i64 64 ; CHECK-NEXT: [[F_VAL:%.*]] = load ptr, ptr [[TMP0]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll index cd2ed37b22db5..5f75bd788e4bb 100644 --- a/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll +++ b/llvm/test/Transforms/PhaseOrdering/enable-loop-header-duplication-oz.ll @@ -11,7 +11,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-LABEL: define void @test( -; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; NOROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; NOROTATION-NEXT: entry: ; NOROTATION-NEXT: br label [[LOOP_HEADER:%.*]] ; NOROTATION: loop.header: @@ -26,7 +26,7 @@ define void @test(i8* noalias nonnull align 1 %start, i8* %end) unnamed_addr { ; NOROTATION-NEXT: ret void ; ; ROTATION-LABEL: define void @test( -; ROTATION-SAME: ptr noalias nonnull writeonly align 1 captures(address) [[START:%.*]], ptr readnone captures(address) [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { +; ROTATION-SAME: ptr noalias nonnull writeonly align 1 [[START:%.*]], ptr readnone [[END:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] { ; ROTATION-NEXT: entry: ; ROTATION-NEXT: [[_12_I1:%.*]] = icmp eq ptr [[START]], [[END]] ; ROTATION-NEXT: br i1 [[_12_I1]], label [[EXIT:%.*]], label [[LOOP_LATCH_PREHEADER:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll index c029781142af3..ae851e3319e1f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_7zip.ll @@ -13,17 +13,18 @@ define fastcc void @LzmaDec_DecodeReal2(ptr %p, i1 %arg) { ; CHECK-NEXT: [[RANGE20_I:%.*]] = getelementptr inbounds [[STRUCT_CLZMADEC_1_28_55_82_103_124_145_166_181_196_229_259_334:%.*]], ptr [[P:%.*]], i64 0, i32 4 ; CHECK-NEXT: br label [[DO_BODY66_I:%.*]] ; CHECK: do.body66.i: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ undef, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP0]] -; CHECK-NEXT: br i1 %arg, label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] +; CHECK-NEXT: 
[[TMP0:%.*]] = phi <2 x i32> [ [[TMP3:%.*]], [[DO_COND_I:%.*]] ], [ zeroinitializer, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP1:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[DO_COND_I]], label [[IF_ELSE_I:%.*]] ; CHECK: if.else.i: -; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], undef +; CHECK-NEXT: [[TMP2:%.*]] = sub <2 x i32> [[TMP1]], zeroinitializer ; CHECK-NEXT: br label [[DO_COND_I]] ; CHECK: do.cond.i: -; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP1]], [[DO_BODY66_I]] ] -; CHECK-NEXT: br i1 %arg, label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] +; CHECK-NEXT: [[TMP3]] = phi <2 x i32> [ [[TMP2]], [[IF_ELSE_I]] ], [ [[TMP5]], [[DO_BODY66_I]] ] +; CHECK-NEXT: br i1 [[ARG]], label [[DO_BODY66_I]], label [[DO_END1006_I:%.*]] ; CHECK: do.end1006.i: -; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> undef, <2 x i32> undef, <2 x i32> [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = select <2 x i1> zeroinitializer, <2 x i32> zeroinitializer, <2 x i32> [[TMP3]] ; CHECK-NEXT: store <2 x i32> [[TMP4]], ptr [[RANGE20_I]], align 4 ; CHECK-NEXT: ret void ; @@ -33,25 +34,25 @@ entry: br label %do.body66.i do.body66.i: ; preds = %do.cond.i, %entry - %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ undef, %entry ] - %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ undef, %entry ] - %.range.2.i = select i1 undef, i32 undef, i32 %range.2.i - %.code.2.i = select i1 undef, i32 undef, i32 %code.2.i + %range.2.i = phi i32 [ %range.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %code.2.i = phi i32 [ %code.4.i, %do.cond.i ], [ zeroinitializer, %entry ] + %.range.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.2.i + %.code.2.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.2.i br i1 %arg, label %do.cond.i, label %if.else.i if.else.i: ; preds = %do.body66.i - %sub91.i = sub i32 %.range.2.i, undef - %sub92.i = sub i32 %.code.2.i, undef + %sub91.i = sub i32 %.range.2.i, zeroinitializer + %sub92.i = sub i32 %.code.2.i, zeroinitializer br label %do.cond.i do.cond.i: ; preds = %if.else.i, %do.body66.i - %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ undef, %do.body66.i ] + %range.4.i = phi i32 [ %sub91.i, %if.else.i ], [ zeroinitializer, %do.body66.i ] %code.4.i = phi i32 [ %sub92.i, %if.else.i ], [ %.code.2.i, %do.body66.i ] br i1 %arg, label %do.body66.i, label %do.end1006.i do.end1006.i: ; preds = %do.cond.i - %.range.4.i = select i1 undef, i32 undef, i32 %range.4.i - %.code.4.i = select i1 undef, i32 undef, i32 %code.4.i + %.range.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %range.4.i + %.code.4.i = select i1 zeroinitializer, i32 zeroinitializer, i32 %code.4.i store i32 %.range.4.i, ptr %range20.i, align 4 store i32 %.code.4.i, ptr %code21.i, align 4 ret void diff --git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test index 3e39591c46dce..81ca701e78a49 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-dwo.test @@ -72,6 +72,7 @@ RUN: llvm-dwarfdump --statistics statistics-fib.split-dwarf.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 7, CHECK: "#inlined functions with abstract origins": 7, CHECK: "#unique source variables": 9, diff 
--git a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test index 855dcedc76f0b..82939c77e25d4 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test +++ b/llvm/test/tools/llvm-dwarfdump/X86/statistics-v3.test @@ -67,6 +67,7 @@ RUN: llvm-dwarfdump --statistics %t-statistics-fib.o | FileCheck %s CHECK: "version": 9, CHECK: "#functions": 3, CHECK: "#functions with location": 3, +CHECK: "#out-of-line functions": 3, CHECK: "#inlined functions": 8, CHECK: "#inlined functions with abstract origins": 8, CHECK: "#unique source variables": 9, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll index 05626e60ca0c7..97482e9c9b858 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-multi-cu.ll @@ -4,6 +4,7 @@ ; Test that abstract origins in multiple CUs are uniqued. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll index 3e4feca06d56f..25f81f31d18ac 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-inlining-single-cu.ll @@ -5,6 +5,7 @@ ; The results for both tests should be identical. ; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 3, ; CHECK: "#inlined functions": 2, ; CHECK: "#unique source variables": 4, ; CHECK-NEXT: "#source variables": 6, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll index 85f66f492ff78..6fd3b84fdc19a 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-mulitple-cu-out-of-line.ll @@ -20,6 +20,7 @@ ; CHECK: "#functions": 3, ; CHECK-NEXT: "#functions with location": 3, +; CHECK-NEXT: "#out-of-line functions": 4, ; CHECK-NEXT: "#inlined functions": 0, ; CHECK-NEXT: "#inlined functions with abstract origins": 0, ; CHECK-NEXT: "#unique source variables": 1, diff --git a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll index 2f1e1e15aa3a9..60ca52a274375 100644 --- a/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll +++ b/llvm/test/tools/llvm-dwarfdump/X86/stats-multiple-cu-same-name.ll @@ -4,6 +4,7 @@ ; Test that statistics distinguish functions with the same name. 
; CHECK: "#functions": 4, +; CHECK: "#out-of-line functions": 4, ; CHECK: "#unique source variables": 2, ; CHECK-NEXT: "#source variables": 2, diff --git a/llvm/tools/llvm-dwarfdump/Statistics.cpp b/llvm/tools/llvm-dwarfdump/Statistics.cpp index 6f2919318a6d5..1670709c08314 100644 --- a/llvm/tools/llvm-dwarfdump/Statistics.cpp +++ b/llvm/tools/llvm-dwarfdump/Statistics.cpp @@ -971,6 +971,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, SaturatingUINT64 VarParamUnique = 0; SaturatingUINT64 VarParamWithLoc = 0; SaturatingUINT64 NumFunctions = 0; + SaturatingUINT64 NumOutOfLineFunctions = 0; SaturatingUINT64 NumInlinedFunctions = 0; SaturatingUINT64 NumFuncsWithSrcLoc = 0; SaturatingUINT64 NumAbstractOrigins = 0; @@ -999,6 +1000,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, << Entry.getKey() << ": " << V.getKey() << "\n"); NumFunctions += Stats.IsFunction; NumFuncsWithSrcLoc += Stats.HasSourceLocation; + NumOutOfLineFunctions += Stats.IsFunction * Stats.NumFnOutOfLine; NumInlinedFunctions += Stats.IsFunction * Stats.NumFnInlined; NumAbstractOrigins += Stats.IsFunction * Stats.NumAbstractOrigins; ParamTotal += Stats.NumParams; @@ -1024,6 +1026,7 @@ bool dwarfdump::collectStatsForObjectFile(ObjectFile &Obj, DWARFContext &DICtx, printDatum(J, "#functions", NumFunctions.Value); printDatum(J, "#functions with location", NumFuncsWithSrcLoc.Value); + printDatum(J, "#out-of-line functions", NumOutOfLineFunctions.Value); printDatum(J, "#inlined functions", NumInlinedFunctions.Value); printDatum(J, "#inlined functions with abstract origins", NumAbstractOrigins.Value); diff --git a/llvm/tools/llvm-readobj/ELFDumper.cpp b/llvm/tools/llvm-readobj/ELFDumper.cpp index 2da08127f20a8..fdae09ac767e6 100644 --- a/llvm/tools/llvm-readobj/ELFDumper.cpp +++ b/llvm/tools/llvm-readobj/ELFDumper.cpp @@ -1624,8 +1624,6 @@ const EnumEntry ElfHeaderMipsFlags[] = { ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX909, "gfx909"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90A, "gfx90a"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX90C, "gfx90c"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX940, "gfx940"), \ - ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX941, "gfx941"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX942, "gfx942"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX950, "gfx950"), \ ENUM_ENT(EF_AMDGPU_MACH_AMDGCN_GFX1010, "gfx1010"), \ diff --git a/llvm/unittests/Analysis/CaptureTrackingTest.cpp b/llvm/unittests/Analysis/CaptureTrackingTest.cpp index 3f5c10d935167..73dd82fb921f7 100644 --- a/llvm/unittests/Analysis/CaptureTrackingTest.cpp +++ b/llvm/unittests/Analysis/CaptureTrackingTest.cpp @@ -77,9 +77,9 @@ TEST(CaptureTracking, MaxUsesToExplore) { struct CollectingCaptureTracker : public CaptureTracker { SmallVector Captures; void tooManyUses() override { } - Action captured(const Use *U, UseCaptureInfo CI) override { + bool captured(const Use *U) override { Captures.push_back(U); - return Continue; + return false; } }; diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp index 0932938b209a4..22181ce33f0da 100644 --- a/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp +++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerHelperTest.cpp @@ -2060,7 +2060,7 @@ TEST_F(AArch64GISelMITest, LibcallFPExt) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC CHECK: $h0 = COPY [[TRUNC]] - CHECK: BL &__gnu_h2f_ieee + CHECK: BL &__extendhfsf2 CHECK: $d0 = COPY CHECK: BL &__extenddftf2 )"; @@ -2103,7 +2103,7 @@ 
TEST_F(AArch64GISelMITest, LibcallFPTrunc) { auto CheckStr = R"( CHECK: [[TRUNC:%[0-9]+]]:_(s32) = G_TRUNC CHECK: $s0 = COPY [[TRUNC]] - CHECK: BL &__gnu_f2h_ieee + CHECK: BL &__truncsfhf2 CHECK: $q0 = COPY CHECK: BL &__trunctfdf2 )"; diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 93ac7381b02ef..5d771a1a153f7 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1167,6 +1167,7 @@ INSTANTIATE_TEST_SUITE_P( AArch64CPUTestParams("a64fx", "armv8.2-a"), AArch64CPUTestParams("fujitsu-monaka", "armv9.3-a"), AArch64CPUTestParams("carmel", "armv8.2-a"), + AArch64CPUTestParams("grace", "armv9-a"), AArch64CPUTestParams("saphira", "armv8.4-a"), AArch64CPUTestParams("oryon-1", "armv8.6-a")), AArch64CPUTestParams::PrintToStringParamName); @@ -1247,7 +1248,6 @@ TEST_P(AArch64CPUAliasTestFixture, testCPUAlias) { INSTANTIATE_TEST_SUITE_P( AArch64CPUAliasTests, AArch64CPUAliasTestFixture, ::testing::Values(AArch64CPUAliasTestParams({"neoverse-n2", "cobalt-100"}), - AArch64CPUAliasTestParams({"neoverse-v2", "grace"}), AArch64CPUAliasTestParams({"apple-a7", "cyclone", "apple-a8", "apple-a9"}), AArch64CPUAliasTestParams({"apple-a12", "apple-s4", diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index 69745addfd748..f795dd89b79a1 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -602,7 +602,7 @@ def AMDGPU_MFMAOp : order (that is, v[0] will go to arg[7:0], v[1] to arg[15:8] and so on). The negateA, negateB, and negateC flags are only supported for double-precision - operations on gfx940+. + operations on gfx94x. }]; let assemblyFormat = [{ $sourceA `*` $sourceB `+` $destC diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 01059e42974d0..e9dcd112ce54e 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -348,11 +348,11 @@ def ROCDL_mfma_f32_16x16x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x4bf16.1k"> def ROCDL_mfma_f32_4x4x4bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.4x4x4bf16.1k">; def ROCDL_mfma_f32_32x32x8bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x8bf16.1k">; def ROCDL_mfma_f32_16x16x16bf16_1k : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x16bf16.1k">; -// Note: in gfx940, unlike in gfx90a, the f64 xdlops use the "blgp" argument as a -// NEG bitfield. See IntrinsicsAMDGPU.td for more info. +// Note: in gfx94x, unlike in gfx90a, the f64 xdlops use the "blgp" argument as +// a NEG bitfield. See IntrinsicsAMDGPU.td for more info. def ROCDL_mfma_f64_16x16x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.16x16x4f64">; def ROCDL_mfma_f64_4x4x4f64 : ROCDL_Mfma_IntrOp<"mfma.f64.4x4x4f64">; -// New in gfx940. +// New in gfx94x. 
def ROCDL_mfma_i32_16x16x32_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.16x16x32.i8">; def ROCDL_mfma_i32_32x32x16_i8 : ROCDL_Mfma_IntrOp<"mfma.i32.32x32x16.i8">; def ROCDL_mfma_f32_16x16x8_xf32 : ROCDL_Mfma_IntrOp<"mfma.f32.16x16x8.xf32">; @@ -375,7 +375,7 @@ def ROCDL_mfma_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.f16">; def ROCDL_mfma_scale_f32_16x16x128_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.16x16x128.f8f6f4", [0,1]>; def ROCDL_mfma_scale_f32_32x32x64_f8f6f4 : ROCDL_Mfma_OO_IntrOp<"mfma.scale.f32.32x32x64.f8f6f4", [0,1]>; -// 2:4 Sparsity ops (GFX940) +// 2:4 Sparsity ops (GFX94x) def ROCDL_smfmac_f32_16x16x32_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.f16">; def ROCDL_smfmac_f32_32x32x16_f16 : ROCDL_Mfma_IntrOp<"smfmac.f32.32x32x16.f16">; def ROCDL_smfmac_f32_16x16x32_bf16 : ROCDL_Mfma_IntrOp<"smfmac.f32.16x16x32.bf16">; diff --git a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td index 6a439bfb09078..a5725d6f1507e 100644 --- a/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td +++ b/mlir/include/mlir/Dialect/Linalg/IR/LinalgStructuredOps.td @@ -858,7 +858,11 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz let arguments = (ins Variadic:$inputs, Variadic:$outputs, - DefaultValuedOptionalAttr:$indexing_maps + DefaultValuedOptionalAttr< + AffineMapArrayAttr, + "BatchMatmulOp::getDefaultIndexingMaps($_builder.getContext())" + >:$indexing_maps, + DefaultValuedOptionalAttr:$cast ); let results = (outs Variadic:$result_tensors); let regions = (region AnyRegion:$region); @@ -884,9 +888,10 @@ def BatchMatmulOp : LinalgStructuredBase_Op<"batch_matmul", !listconcat([AttrSiz }]>, OpBuilder< (ins "TypeRange":$resultTensorTypes, "ValueRange":$operands, - CArg<"ArrayRef", "{}">:$attributes), + "Attribute":$cast, CArg<"ArrayRef", "{}">:$attributes), [{ $_state.addOperands(operands); + $_state.addAttribute("cast", cast); $_state.addAttributes(attributes); $_state.addTypes(resultTensorTypes); (void)$_state.addRegion(), diff --git a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp index c62314e504dcc..b29228ef87ea7 100644 --- a/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp +++ b/mlir/lib/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.cpp @@ -80,7 +80,7 @@ namespace { // Define commonly used chipsets versions for convenience. 
constexpr Chipset kGfx908 = Chipset(9, 0, 8); constexpr Chipset kGfx90a = Chipset(9, 0, 0xa); -constexpr Chipset kGfx940 = Chipset(9, 4, 0); +constexpr Chipset kGfx942 = Chipset(9, 4, 2); /// Define lowering patterns for raw buffer ops template @@ -483,7 +483,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, destElem = destType.getElementType(); if (sourceElem.isF32() && destElem.isF32()) { - if (mfma.getReducePrecision() && chipset >= kGfx940) { + if (mfma.getReducePrecision() && chipset >= kGfx942) { if (m == 32 && n == 32 && k == 4 && b == 1) return ROCDL::mfma_f32_32x32x4_xf32::getOperationName(); if (m == 16 && n == 16 && k == 8 && b == 1) @@ -551,9 +551,9 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, return ROCDL::mfma_i32_32x32x8i8::getOperationName(); if (m == 16 && n == 16 && k == 16 && b == 1) return ROCDL::mfma_i32_16x16x16i8::getOperationName(); - if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx940) + if (m == 32 && n == 32 && k == 16 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_32x32x16_i8::getOperationName(); - if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx940) + if (m == 16 && n == 16 && k == 32 && b == 1 && chipset >= kGfx942) return ROCDL::mfma_i32_16x16x32_i8::getOperationName(); } @@ -565,7 +565,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { // Known to be correct because there are no scalar f8 instructions and // because a length mismatch will have been caught by the verifier. Type sourceBElem = @@ -585,7 +585,7 @@ static std::optional mfmaOpToIntrinsic(MFMAOp mfma, } if (isa(sourceElem) && destElem.isF32() && - chipset >= kGfx940) { + chipset >= kGfx942) { Type sourceBElem = cast(mfma.getSourceB().getType()).getElementType(); if (m == 16 && n == 16 && k == 32 && b == 1) { @@ -653,8 +653,8 @@ struct MFMAOpLowering : public ConvertOpToLLVMPattern { return op->emitOpError("MFMA only supported on gfx908+"); uint32_t getBlgpField = static_cast(op.getBlgp()); if (op.getNegateA() || op.getNegateB() || op.getNegateC()) { - if (chipset < kGfx940) - return op.emitOpError("negation unsupported on older than gfx940"); + if (chipset < kGfx942) + return op.emitOpError("negation unsupported on older than gfx942"); getBlgpField |= op.getNegateA() | (op.getNegateB() << 1) | (op.getNegateC() << 2); } @@ -775,7 +775,7 @@ LogicalResult ExtPackedFp8OpLowering::matchAndRewrite( ExtPackedFp8Op op, ExtPackedFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -819,7 +819,7 @@ LogicalResult PackedTrunc2xFp8OpLowering::matchAndRewrite( PackedTrunc2xFp8Op op, PackedTrunc2xFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -856,7 +856,7 @@ LogicalResult PackedStochRoundFp8OpLowering::matchAndRewrite( PackedStochRoundFp8Op op, PackedStochRoundFp8OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { Location loc = 
op.getLoc(); - if (chipset.majorVersion != 9 || chipset < kGfx940) + if (chipset.majorVersion != 9 || chipset < kGfx942) return rewriter.notifyMatchFailure( loc, "Fp8 conversion instructions are not available on target " "architecture and their emulation is not implemented"); @@ -1038,7 +1038,7 @@ struct AMDGPUDPPLowering : public ConvertOpToLLVMPattern { struct ConvertAMDGPUToROCDLPass : public impl::ConvertAMDGPUToROCDLPassBase { - using ConvertAMDGPUToROCDLPassBase::ConvertAMDGPUToROCDLPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *ctx = &getContext(); diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp index 60a002c41bfb2..b22d852f7c543 100644 --- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp +++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp @@ -384,7 +384,7 @@ void ArithToAMDGPUConversionPass::runOnOperation() { } bool convertFP8Arithmetic = - maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 0); + maybeChipset->majorVersion == 9 && *maybeChipset >= Chipset(9, 4, 2); arith::populateArithToAMDGPUConversionPatterns( patterns, convertFP8Arithmetic, saturateFP8Truncf, allowPackedF16Rtz, *maybeChipset); diff --git a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp index 5887e37b7f0b4..1f2781aa82114 100644 --- a/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp +++ b/mlir/lib/Conversion/ArithToSPIRV/ArithToSPIRV.cpp @@ -1338,7 +1338,7 @@ void mlir::arith::populateArithToSPIRVPatterns( namespace { struct ConvertArithToSPIRVPass : public impl::ConvertArithToSPIRVPassBase { - using ConvertArithToSPIRVPassBase::ConvertArithToSPIRVPassBase; + using Base::Base; void runOnOperation() override { Operation *op = getOperation(); diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp index ea25d5afaeeca..5089179435f1e 100644 --- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp +++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp @@ -1072,7 +1072,7 @@ namespace { struct ConvertComplexToStandardPass : public impl::ConvertComplexToStandardPassBase< ConvertComplexToStandardPass> { - using ConvertComplexToStandardPassBase::ConvertComplexToStandardPassBase; + using Base::Base; void runOnOperation() override; }; diff --git a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp index a0ae39a353a95..03f4bf4df4912 100644 --- a/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp +++ b/mlir/lib/Conversion/ControlFlowToSPIRV/ControlFlowToSPIRVPass.cpp @@ -28,7 +28,7 @@ namespace { class ConvertControlFlowToSPIRVPass final : public impl::ConvertControlFlowToSPIRVPassBase< ConvertControlFlowToSPIRVPass> { - using ConvertControlFlowToSPIRVPassBase::ConvertControlFlowToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp index 572a432d6d641..8ed9f659afb10 100644 --- a/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp +++ b/mlir/lib/Conversion/FuncToSPIRV/FuncToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR Func operations into the SPIR-V dialect. 
class ConvertFuncToSPIRVPass : public impl::ConvertFuncToSPIRVPassBase { - using ConvertFuncToSPIRVPassBase::ConvertFuncToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp index 5d53aef199d52..b06ab44d159af 100644 --- a/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp +++ b/mlir/lib/Conversion/MemRefToSPIRV/MemRefToSPIRVPass.cpp @@ -27,7 +27,7 @@ namespace { /// A pass converting MLIR MemRef operations into the SPIR-V dialect. class ConvertMemRefToSPIRVPass : public impl::ConvertMemRefToSPIRVPassBase { - using ConvertMemRefToSPIRVPassBase::ConvertMemRefToSPIRVPassBase; + using Base::Base; void runOnOperation() override; }; } // namespace diff --git a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp index 8e2efbc7f4280..99631705851fd 100644 --- a/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp +++ b/mlir/lib/Conversion/SCFToGPU/SCFToGPUPass.cpp @@ -34,7 +34,7 @@ namespace { // walk the function recursively to avoid considering nested loops. struct ForLoopMapper : public impl::ConvertAffineForToGPUPassBase { - using ConvertAffineForToGPUPassBase::ConvertAffineForToGPUPassBase; + using Base::Base; void runOnOperation() override { for (Operation &op : llvm::make_early_inc_range( diff --git a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp index 9e98dc7d7aaf6..f07386ea80124 100644 --- a/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp +++ b/mlir/lib/Conversion/TensorToSPIRV/TensorToSPIRVPass.cpp @@ -29,7 +29,7 @@ namespace { /// A pass converting MLIR Tensor operations into the SPIR-V dialect. class ConvertTensorToSPIRVPass : public impl::ConvertTensorToSPIRVPassBase { - using ConvertTensorToSPIRVPassBase::ConvertTensorToSPIRVPassBase; + using Base::Base; void runOnOperation() override { MLIRContext *context = &getContext(); diff --git a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp index faedf93d15037..188a18928926f 100644 --- a/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp +++ b/mlir/lib/Conversion/TosaToArith/TosaToArithPass.cpp @@ -31,7 +31,7 @@ using namespace tosa; namespace { struct TosaToArith : public impl::TosaToArithPassBase { - using TosaToArithPassBase::TosaToArithPassBase; + using Base::Base; void runOnOperation() override { TypeConverter converter; diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp index 77f972e0e5894..7459a6503cddf 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp +++ b/mlir/lib/Dialect/AMDGPU/Transforms/EmulateAtomics.cpp @@ -179,7 +179,7 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } // gfx9 has no to a very limited support for floating-point min and max. if (chipset.majorVersion == 9) { - if (chipset >= Chipset(9, 0, 0xa) && chipset != Chipset(9, 4, 1)) { + if (chipset >= Chipset(9, 0, 0xa)) { // gfx90a supports f64 max (and min, but we don't have a min wrapper right // now) but all other types need to be emulated. target.addDynamicallyLegalOp( @@ -189,12 +189,6 @@ void mlir::amdgpu::populateAmdgpuEmulateAtomicsPatterns( } else { target.addIllegalOp(); } - if (chipset == Chipset(9, 4, 1)) { - // gfx941 requires non-CAS atomics to be implemented with CAS loops. - // The workaround here mirrors HIP and OpenMP. 
- target.addIllegalOp(); - } } patterns.add< RawBufferAtomicByCasPattern, diff --git a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp index 0896e7ef51e56..a8da7ecce26df 100644 --- a/mlir/lib/Dialect/Affine/IR/AffineOps.cpp +++ b/mlir/lib/Dialect/Affine/IR/AffineOps.cpp @@ -3937,14 +3937,24 @@ LogicalResult AffineParallelOp::verify() { } unsigned expectedNumLBResults = 0; - for (APInt v : getLowerBoundsGroups()) - expectedNumLBResults += v.getZExtValue(); + for (APInt v : getLowerBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected lower bound map to have at least one result"; + expectedNumLBResults += results; + } if (expectedNumLBResults != getLowerBoundsMap().getNumResults()) return emitOpError() << "expected lower bounds map to have " << expectedNumLBResults << " results"; unsigned expectedNumUBResults = 0; - for (APInt v : getUpperBoundsGroups()) - expectedNumUBResults += v.getZExtValue(); + for (APInt v : getUpperBoundsGroups()) { + unsigned results = v.getZExtValue(); + if (results == 0) + return emitOpError() + << "expected upper bound map to have at least one result"; + expectedNumUBResults += results; + } if (expectedNumUBResults != getUpperBoundsMap().getNumResults()) return emitOpError() << "expected upper bounds map to have " << expectedNumUBResults << " results"; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp index 71ea0fd9d43cd..77840690e6a26 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotModuleBufferize.cpp @@ -289,35 +289,6 @@ static func::FuncOp getCalledFunction(func::CallOp callOp) { SymbolTable::lookupNearestSymbolFrom(callOp, sym)); } -/// Gather equivalence info of CallOps. -/// Note: This only adds new equivalence info if the called function was already -/// analyzed. -// TODO: This does not handle cyclic function call graphs etc. -static void equivalenceAnalysis(func::FuncOp funcOp, - OneShotAnalysisState &state, - FuncAnalysisState &funcState) { - funcOp->walk([&](func::CallOp callOp) { - func::FuncOp calledFunction = getCalledFunction(callOp); - assert(calledFunction && "could not retrieved called func::FuncOp"); - - // No equivalence info available for the called function. - if (!funcState.equivalentFuncArgs.count(calledFunction)) - return WalkResult::skip(); - - for (auto it : funcState.equivalentFuncArgs[calledFunction]) { - int64_t returnIdx = it.first; - int64_t bbargIdx = it.second; - if (!state.isInPlace(callOp->getOpOperand(bbargIdx))) - continue; - Value returnVal = callOp.getResult(returnIdx); - Value argVal = callOp->getOperand(bbargIdx); - state.unionEquivalenceClasses(returnVal, argVal); - } - - return WalkResult::advance(); - }); -} - /// Return "true" if the given function signature has tensor semantics. static bool hasTensorSignature(func::FuncOp funcOp) { return llvm::any_of(funcOp.getFunctionType().getInputs(), @@ -493,9 +464,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, // Now analyzing function. funcState.startFunctionAnalysis(funcOp); - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. 
if (failed(analyzeOp(funcOp, state, statistics))) return failure(); @@ -514,9 +482,6 @@ mlir::bufferization::analyzeModuleOp(ModuleOp moduleOp, if (!state.getOptions().isOpAllowed(funcOp)) continue; - // Gather equivalence info for CallOps. - equivalenceAnalysis(funcOp, state, funcState); - // Analyze funcOp. if (failed(analyzeOp(funcOp, state, statistics))) return failure(); diff --git a/mlir/lib/Dialect/DLTI/DLTI.cpp b/mlir/lib/Dialect/DLTI/DLTI.cpp index b057554c40d8c..70e05cb4cb383 100644 --- a/mlir/lib/Dialect/DLTI/DLTI.cpp +++ b/mlir/lib/Dialect/DLTI/DLTI.cpp @@ -571,7 +571,8 @@ FailureOr dlti::query(Operation *op, ArrayRef keys, return failure(); MLIRContext *ctx = op->getContext(); - SmallVector entryKeys(keys.size()); + SmallVector entryKeys; + entryKeys.reserve(keys.size()); for (StringRef key : keys) entryKeys.push_back(StringAttr::get(ctx, key)); diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp index 0fc23a7425477..a71428985c54e 100644 --- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp +++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp @@ -3968,11 +3968,18 @@ void BatchMatmulOp::regionBuilder(ImplicitLocOpBuilder &b, Block &block, RegionBuilderHelper helper(b, block); SmallVector yields; + TypeFn castVal = TypeFn::cast_signed; + auto castIter = llvm::find_if(attrs, [&](const NamedAttribute &attr) { + return attr.getName() == "cast"; + }); + if (castIter != attrs.end()) { + if (auto attr = llvm::dyn_cast(castIter->getValue())) + castVal = attr.getValue(); + } + auto toType = block.getArgument(2).getType(); - Value castValA = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(0)); - Value castValB = - helper.buildTypeFn(TypeFn::cast_signed, toType, block.getArgument(1)); + Value castValA = helper.buildTypeFn(castVal, toType, block.getArgument(0)); + Value castValB = helper.buildTypeFn(castVal, toType, block.getArgument(1)); Value mulVal = helper.buildBinaryFn(BinaryFn::mul, castValA, castValB); Value addVal = helper.buildBinaryFn(BinaryFn::add, block.getArgument(2), mulVal); @@ -4021,11 +4028,6 @@ ParseResult BatchMatmulOp::parse(OpAsmParser &parser, OperationState &result) { } void BatchMatmulOp::print(OpAsmPrinter &p) { - SmallVector elidedAttrs = { - "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; - ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), - elidedAttrs); - SmallVector indexingMaps = llvm::map_to_vector( BatchMatmulOp::getDefaultIndexingMaps(getContext()), [](AffineMap map) -> Attribute { return AffineMapAttr::get(map); }); @@ -4035,6 +4037,11 @@ void BatchMatmulOp::print(OpAsmPrinter &p) { [&](Attribute attr) { p.printAttribute(attr); }); p << "]"; } + + SmallVector elidedAttrs = { + "operandSegmentSizes", "linalg.memoized_indexing_maps", "indexing_maps"}; + ::printNamedStructuredOp(p, getOperation(), getInputs(), getOutputs(), + elidedAttrs); } /// Verify the user defined indexing maps. 
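For context on the dlti::query change above: constructing a SmallVector with a size value-initializes that many elements, so a subsequent push_back loop appends after them, whereas reserve() only pre-allocates. A minimal self-contained sketch of the difference (illustrative C++, not part of the patch):

#include "llvm/ADT/SmallVector.h"
#include <cassert>

int main() {
  // SmallVector<int>(3) value-initializes three zeros, so push_back appends a
  // fourth element; the pre-fix code would likewise have left null StringAttrs
  // ahead of the real keys.
  llvm::SmallVector<int> sized(3);
  sized.push_back(42);
  assert(sized.size() == 4 && sized[0] == 0);

  // reserve() only pre-allocates: size stays 0 and push_back fills from index 0,
  // which is what the rewritten loop relies on.
  llvm::SmallVector<int> reserved;
  reserved.reserve(3);
  reserved.push_back(42);
  assert(reserved.size() == 1 && reserved[0] == 42);
  return 0;
}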
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 62e1c4c3ed3b1..d725a457aeff6 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -32,7 +32,6 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" -#include "llvm/Support/Casting.h" #include #include #include diff --git a/mlir/python/mlir/dialects/linalg/__init__.py b/mlir/python/mlir/dialects/linalg/__init__.py index 5cda4769d593f..c5fbb833ee399 100644 --- a/mlir/python/mlir/dialects/linalg/__init__.py +++ b/mlir/python/mlir/dialects/linalg/__init__.py @@ -149,7 +149,8 @@ def __init__( generic = region_op(GenericOp_, terminator=YieldOp) -def matmul( +def create_op( + op_type, *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Optional[Sequence[AffineMapAttr]] = None, @@ -161,7 +162,7 @@ def matmul( init = _get_op_result_or_value(outs[0]) result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - op = MatmulOp( + op = op_type( result_tensors=result_types, inputs=ins, outputs=[init], @@ -172,24 +173,32 @@ def matmul( return op +def matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op(MatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast) + + +def batch_matmul( + *ins: Union[Operation, OpView, Value], + outs: Sequence[Union[Operation, OpView, Value]], + indexing_maps: Optional[Sequence[AffineMapAttr]] = None, + cast: Optional[Union[TypeFn, Attribute]] = None, +): + return create_op( + BatchMatmulOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast + ) + + def contract( *ins: Union[Operation, OpView, Value], outs: Sequence[Union[Operation, OpView, Value]], indexing_maps: Sequence[AffineMapAttr], cast: Optional[Union[TypeFn, Attribute]] = None, ): - ins = [_get_op_result_or_value(input) for input in ins] - if len(outs) > 1: - raise ValueError(f"{outs=} must have length 1.") - init = _get_op_result_or_value(outs[0]) - result_types = [init.type] if isinstance(init.type, RankedTensorType) else [] - - op = ContractOp( - result_tensors=result_types, - inputs=ins, - outputs=[init], - indexing_maps=indexing_maps, - cast=cast, + return create_op( + ContractOp, *ins, outs=outs, indexing_maps=indexing_maps, cast=cast ) - fill_builtin_region(op.operation) - return op diff --git a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir index 7818a525d17b5..a313aaffdf5cc 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 | FileCheck %s +// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx942 | FileCheck %s // CHECK-LABEL: func @ext_scalar // CHECK: [[V:%.+]] = builtin.unrealized_conversion_cast %{{.+}} : f8E5M2FNUZ to i8 diff --git a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir index f8a60d37801eb..52db1421dc3c6 100644 --- a/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir +++ b/mlir/test/Conversion/AMDGPUToROCDL/mfma.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-amdgpu-to-rocdl=chipset=gfx940 -cse | FileCheck %s +// RUN: mlir-opt %s 
-convert-amdgpu-to-rocdl=chipset=gfx942 -cse | FileCheck %s func.func @mfma_to_rocdl(%arg0 : f32, %arg1 : vector<32xf32>, %arg2 : vector<16xf32>, %arg3 : vector<4xf32>, %arg4 : vector<4xf16>, %arg5 : vector<4xi8>, diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir index cd921da2294e1..07a428566d488 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir @@ -1,5 +1,5 @@ // RUN: mlir-opt --split-input-file %s \ -// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx940 saturate-fp8-truncf=true}))' \ +// RUN: --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{chipset=gfx942 saturate-fp8-truncf=true}))' \ // RUN: | FileCheck %s // CHECK-LABEL: func.func @scalar_trunc diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir index 985fb532ea74a..6bb5b9771c015 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx940" | FileCheck %s +// RUN: mlir-opt --split-input-file %s -convert-arith-to-amdgpu="chipset=gfx942" | FileCheck %s // CHECK-LABEL: func.func @scalar_ext // CHECK-SAME: ([[V:%.+]]: f8E5M2FNUZ) diff --git a/mlir/test/Dialect/Affine/invalid.mlir b/mlir/test/Dialect/Affine/invalid.mlir index 44e484b9ba598..da2913e3fec28 100644 --- a/mlir/test/Dialect/Affine/invalid.mlir +++ b/mlir/test/Dialect/Affine/invalid.mlir @@ -297,6 +297,24 @@ func.func @affine_parallel(%arg0 : index, %arg1 : index, %arg2 : index) { // ----- +func.func @no_upper_bound_affine_parallel() { + // expected-error@+1 {{expected lower bound map to have at least one result}} + affine.parallel (%arg2) = (max()) to (1) { + } + return +} + +// ----- + +func.func @no_upper_bound_affine_parallel() { + // expected-error@+1 {{expected upper bound map to have at least one result}} + affine.parallel (%arg3) = (0) to (min()) { + } + return +} + +// ----- + func.func @vector_load_invalid_vector_type() { %0 = memref.alloc() : memref<100xf32> affine.for %i0 = 0 to 16 step 8 { diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir index 2ca7f7109005c..c947407c63e74 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir @@ -1,4 +1,5 @@ // RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only dump-alias-sets" -split-input-file | FileCheck %s --check-prefix=CHECK-ALIAS // Run fuzzer with different seeds. 
 // RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
@@ -1406,3 +1407,21 @@ func.func @caller(%c: i1, %t0: tensor<5xf32>, %t1: tensor<5xf32>, %t2: tensor<5x
   return %r : tensor<5xf32>
 }
+// -----
+
+// CHECK-ALIAS-LABEL: func @foo
+func.func @foo(%arg0: tensor) -> tensor {
+  // CHECK-ALIAS: return
+  // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0]
+  return %arg0 : tensor
+}
+
+// CHECK-ALIAS: func @bar(%[[arg0:.*]]: tensor
+func.func @bar(%arg0: tensor) -> tensor {
+  // CHECK-ALIAS: %[[call:.*]] = call @foo(%[[arg0]])
+  // CHECK-ALIAS-SAME: {__inplace_operands_attr__ = ["true"], __opresult_alias_set_attr__ = [{{\[}}"%[[call]]", "%[[arg0]]"]]}
+  %x = call @foo(%arg0) : (tensor) -> tensor
+  // CHECK-ALIAS: return
+  // CHECK-ALIAS-SAME: __equivalent_func_args__ = [0]
+  return %x : tensor
+}
diff --git a/mlir/test/Dialect/Linalg/named-ops.mlir b/mlir/test/Dialect/Linalg/named-ops.mlir
index 8474eeac0db5b..1bd9c8825b05e 100644
--- a/mlir/test/Dialect/Linalg/named-ops.mlir
+++ b/mlir/test/Dialect/Linalg/named-ops.mlir
@@ -1497,7 +1497,7 @@ func.func @matmul_transpose_b(%arg0: memref<3x5xf32>, %arg1: memref<7x5xf32>, %a
 // CHECK-SAME: %[[VAL_0:.*]]: memref<5xf32>,
 // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>,
 // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) {
-// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>)
 // CHECK: return
 // CHECK: }
 func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) {
@@ -1520,7 +1520,7 @@ func.func @batch_matmul_bcast_k_to_fill_missing_dims_A(%arg0: memref<5xf32>, %ar
 // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>,
 // CHECK-SAME: %[[VAL_1:.*]]: memref<2x5x7xf32>,
 // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) {
-// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>)
 // CHECK: return
 // CHECK: }
 func.func @batch_matmul_bcast_batch_dim_A(%arg0: memref<3x5xf32>, %arg1: memref<2x5x7xf32>, %arg2: memref<2x3x7xf32>) {
@@ -1543,7 +1543,7 @@ func.func @batch_matmul_bcast_batch_dim_A(%arg0: memref<3x5xf32>, %arg1: memref<
 // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>,
 // CHECK-SAME: %[[VAL_1:.*]]: memref<5xf32>,
 // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) {
-// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>)
 // CHECK: return
 // CHECK: }
 func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1: memref<5xf32>, %arg2: memref<2x3x7xf32>) {
@@ -1566,7 +1566,7 @@ func.func @batch_matmul_bcast_batch_and_n_dim_B(%arg0: memref<2x3x5xf32>, %arg1:
 // CHECK-SAME: %[[VAL_0:.*]]: memref<2x3x5xf32>,
 // CHECK-SAME: %[[VAL_1:.*]]: memref<5x7xf32>,
 // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) {
-// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<2x3x5xf32>, memref<5x7xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>)
 // CHECK: return
 // CHECK: }
@@ -1622,7 +1622,7 @@ func.func @batch_matmul_explicit_transpose_B(%arg0: memref<2x3x5xf32>, %arg1: me
 // CHECK-SAME: %[[VAL_0:.*]]: memref<3x5xf32>,
 // CHECK-SAME: %[[VAL_1:.*]]: memref<2x7x5xf32>,
 // CHECK-SAME: %[[VAL_2:.*]]: memref<2x3x7xf32>) {
-// CHECK: linalg.batch_matmul ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>) indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]]
+// CHECK: linalg.batch_matmul indexing_maps = [#[[$ATTR_0]], #[[$ATTR_1]], #[[$ATTR_2]]] ins(%[[VAL_0]], %[[VAL_1]] : memref<3x5xf32>, memref<2x7x5xf32>) outs(%[[VAL_2]] : memref<2x3x7xf32>)
 // CHECK: return
 // CHECK: }
 func.func @batch_matmul_bcast_A_transpose_B(%arg0: memref<3x5xf32>, %arg1: memref<2x7x5xf32>, %arg2: memref<2x3x7xf32>) {
diff --git a/mlir/test/python/dialects/linalg/ops.py b/mlir/test/python/dialects/linalg/ops.py
index 94f8ea4faf4a8..307a88709ad52 100644
--- a/mlir/test/python/dialects/linalg/ops.py
+++ b/mlir/test/python/dialects/linalg/ops.py
@@ -466,3 +466,103 @@ def matmul_as_contract_op(
             )
 
     print(module)
+
+
+# CHECK-LABEL: TEST: testBatchMatmulOp
+@run
+def testBatchMatmulOp():
+    with Context(), Location.unknown():
+        module = Module.create()
+        f32 = F32Type.get()
+        with InsertionPoint(module.body):
+            a_shape = (2, 4, 8)
+            b_shape = (2, 8, 12)
+            b_transposed_shape = (2, 12, 8)
+            c_shape = (2, 4, 12)
+
+            dimBatch = ir.AffineDimExpr.get(0)
+            dimM = ir.AffineDimExpr.get(1)
+            dimN = ir.AffineDimExpr.get(2)
+            dimK = ir.AffineDimExpr.get(3)
+
+            # CHECK: #[[$A_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+            # CHECK: #[[$BTrans_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d2, d3)>
+            # CHECK: #[[$C_MAP:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+
+            a_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimK])
+            b_transposed_map = ir.AffineMap.get(4, 0, [dimBatch, dimN, dimK])
+            c_map = ir.AffineMap.get(4, 0, [dimBatch, dimM, dimN])
+
+            # CHECK: func.func @batch_matmul_op(
+            @func.FuncOp.from_py_func(
+                # CHECK-SAME: %[[A:.*]]: tensor<2x4x8xf32>,
+                RankedTensorType.get(a_shape, f32),
+                # CHECK-SAME: %[[Amem:.*]]: memref<2x4x8xf32>,
+                MemRefType.get(a_shape, f32),
+                # CHECK-SAME: %[[B:.*]]: tensor<2x8x12xf32>,
+                RankedTensorType.get(b_shape, f32),
+                # CHECK-SAME: %[[Bmem:.*]]: memref<2x8x12xf32>,
+                MemRefType.get(b_shape, f32),
+                # CHECK-SAME: %[[BTrans:.*]]: tensor<2x12x8xf32>,
+                RankedTensorType.get(b_transposed_shape, f32),
+                # CHECK-SAME: %[[BTransmem:.*]]: memref<2x12x8xf32>,
+                MemRefType.get(b_transposed_shape, f32),
+                # CHECK-SAME: %[[C:.*]]: tensor<2x4x12xf32>,
+                RankedTensorType.get(c_shape, f32),
+                # CHECK-SAME: %[[Cmem:.*]]: memref<2x4x12xf32>)
+                MemRefType.get(c_shape, f32),
+            )
+            def batch_matmul_op(A, Amem, B, Bmem, Btransposed, Btransposedmem, C, Cmem):
+                # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>)
+                res = linalg.BatchMatmulOp(
+                    result_tensors=(C.type,),
+                    inputs=(A, B),
+                    outputs=(C,),
+                )
+                linalg.fill_builtin_region(res.operation)
+                # CHECK: linalg.batch_matmul ins(%[[A]], %[[B]] : tensor<2x4x8xf32>, tensor<2x8x12xf32>) outs(%[[C]] : tensor<2x4x12xf32>)
+                res = linalg.batch_matmul(A, B, outs=(C,))
+
+                # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>)
+                res = linalg.BatchMatmulOp(
+                    result_tensors=(C.type,),
+                    inputs=(A, Btransposed),
+                    outputs=(C,),
+                    indexing_maps=[a_map, b_transposed_map, c_map],
+                )
+                linalg.fill_builtin_region(res.operation)
+                # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[A]], %[[BTrans]] : tensor<2x4x8xf32>, tensor<2x12x8xf32>) outs(%[[C]] : tensor<2x4x12xf32>)
+                res = linalg.batch_matmul(
+                    A,
+                    Btransposed,
+                    outs=(C,),
+                    indexing_maps=[a_map, b_transposed_map, c_map],
+                )
+
+                # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>)
+                res = linalg.BatchMatmulOp(
+                    result_tensors=[],
+                    inputs=(Amem, Bmem),
+                    outputs=(Cmem,),
+                )
+                linalg.fill_builtin_region(res.operation)
+                # CHECK: linalg.batch_matmul ins(%[[Amem]], %[[Bmem]] : memref<2x4x8xf32>, memref<2x8x12xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>)
+                linalg.batch_matmul(Amem, Bmem, outs=(Cmem,))
+
+                # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>)
+                res = linalg.BatchMatmulOp(
+                    result_tensors=[],
+                    inputs=(Amem, Btransposedmem),
+                    outputs=(Cmem,),
+                    indexing_maps=[a_map, b_transposed_map, c_map],
+                )
+                linalg.fill_builtin_region(res.operation)
+                # CHECK: linalg.batch_matmul indexing_maps = [#[[$A_MAP]], #[[$BTrans_MAP]], #[[$C_MAP]]] ins(%[[Amem]], %[[BTransmem]] : memref<2x4x8xf32>, memref<2x12x8xf32>) outs(%[[Cmem]] : memref<2x4x12xf32>)
+                linalg.batch_matmul(
+                    Amem,
+                    Btransposedmem,
+                    outs=(Cmem,),
+                    indexing_maps=[a_map, b_transposed_map, c_map],
+                )
+
+    print(module)
diff --git a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
index bbb6bd6617a13..87df9e19d1842 100644
--- a/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
+++ b/mlir/tools/mlir-rewrite/mlir-rewrite.cpp
@@ -348,11 +348,11 @@ static mlir::RewriterRegistration
     rewriteMarkRanges("mark-ranges", "Indicate ranges parsed", markRanges);
 
 int main(int argc, char **argv) {
-  static llvm::cl::opt<std::string> inputFilename(
-      llvm::cl::Positional, llvm::cl::desc("<input file>"),
-      llvm::cl::init("-"));
+  llvm::cl::opt<std::string> inputFilename(llvm::cl::Positional,
+                                           llvm::cl::desc("<input file>"),
+                                           llvm::cl::init("-"));
 
-  static llvm::cl::opt<std::string> outputFilename(
+  llvm::cl::opt<std::string> outputFilename(
       "o", llvm::cl::desc("Output filename"), llvm::cl::value_desc("filename"),
       llvm::cl::init("-"));
 
diff --git a/mlir/tools/mlir-runner/mlir-runner.cpp b/mlir/tools/mlir-runner/mlir-runner.cpp
index 7e8793de03ead..932c9f6cc9fdc 100644
--- a/mlir/tools/mlir-runner/mlir-runner.cpp
+++ b/mlir/tools/mlir-runner/mlir-runner.cpp
@@ -32,7 +32,7 @@ using namespace mlir;
 // TODO: Consider removing this linking functionality from the SPIR-V CPU Runner
 // flow in favour of a more proper host/device split like other runners.
 // https://github.com/llvm/llvm-project/issues/115348
-llvm::cl::opt<bool> LinkNestedModules(
+static llvm::cl::opt<bool> LinkNestedModules(
     "link-nested-modules",
     llvm::cl::desc("Link two nested MLIR modules into a single LLVM IR module. "
                    "Useful if both the host and device code can be run on the "
diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp
index 414cad5e1dcc2..6cf71d2bb0174 100644
--- a/mlir/tools/mlir-tblgen/DialectGen.cpp
+++ b/mlir/tools/mlir-tblgen/DialectGen.cpp
@@ -34,7 +34,7 @@ using llvm::Record;
 using llvm::RecordKeeper;
 
 static llvm::cl::OptionCategory dialectGenCat("Options for -gen-dialect-*");
-llvm::cl::opt<std::string>
+static llvm::cl::opt<std::string>
     selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"),
                     llvm::cl::cat(dialectGenCat), llvm::cl::CommaSeparated);
 
diff --git a/mlir/tools/mlir-tblgen/OpDocGen.cpp b/mlir/tools/mlir-tblgen/OpDocGen.cpp
index 43d406e4340f7..dbaad84cda5d6 100644
--- a/mlir/tools/mlir-tblgen/OpDocGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDocGen.cpp
@@ -44,11 +44,11 @@ using mlir::tblgen::Operator;
 //===----------------------------------------------------------------------===//
 static cl::OptionCategory
     docCat("Options for -gen-(attrdef|typedef|enum|op|dialect)-doc");
 
-cl::opt<std::string>
+static cl::opt<std::string>
     stripPrefix("strip-prefix",
                 cl::desc("Strip prefix of the fully qualified names"),
                 cl::init("::mlir::"), cl::cat(docCat));
-cl::opt<bool> allowHugoSpecificFeatures(
+static cl::opt<bool> allowHugoSpecificFeatures(
     "allow-hugo-specific-features",
     cl::desc("Allows using features specific to Hugo"), cl::init(false),
     cl::cat(docCat));
diff --git a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp
index c9f6dd35de44e..c2ad09ffaaed5 100644
--- a/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp
+++ b/mlir/tools/tblgen-to-irdl/OpDefinitionsGen.cpp
@@ -36,7 +36,7 @@ using namespace mlir;
 using tblgen::NamedTypeConstraint;
 
 static llvm::cl::OptionCategory dialectGenCat("Options for -gen-irdl-dialect");
-llvm::cl::opt<std::string>
+static llvm::cl::opt<std::string>
     selectedDialect("dialect", llvm::cl::desc("The dialect to gen for"),
                     llvm::cl::cat(dialectGenCat), llvm::cl::Required);
 
diff --git a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
index 976ff2e7382ed..570d56f3c6ff1 100644
--- a/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
+++ b/mlir/unittests/Dialect/AMDGPU/AMDGPUUtilsTest.cpp
@@ -19,11 +19,11 @@ TEST(ChipsetTest, Parsing) {
   EXPECT_EQ(chipset->minorVersion, 0u);
   EXPECT_EQ(chipset->steppingVersion, 0xau);
 
-  chipset = Chipset::parse("gfx940");
+  chipset = Chipset::parse("gfx942");
   ASSERT_TRUE(succeeded(chipset));
   EXPECT_EQ(chipset->majorVersion, 9u);
   EXPECT_EQ(chipset->minorVersion, 4u);
-  EXPECT_EQ(chipset->steppingVersion, 0u);
+  EXPECT_EQ(chipset->steppingVersion, 2u);
 
   chipset = Chipset::parse("gfx1103");
   ASSERT_TRUE(succeeded(chipset));
@@ -36,30 +36,26 @@ TEST(ChipsetTest, ParsingInvalid) {
   EXPECT_TRUE(failed(Chipset::parse("navi33")));
   EXPECT_TRUE(failed(Chipset::parse("rdna2")));
   EXPECT_TRUE(failed(Chipset::parse("sm_80")));
-  EXPECT_TRUE(failed(Chipset::parse("GFX940")));
-  EXPECT_TRUE(failed(Chipset::parse("Gfx940")));
+  EXPECT_TRUE(failed(Chipset::parse("GFX942")));
+  EXPECT_TRUE(failed(Chipset::parse("Gfx942")));
   EXPECT_TRUE(failed(Chipset::parse("gfx9")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx_940")));
-  EXPECT_TRUE(failed(Chipset::parse("gfx940_")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx_942")));
+  EXPECT_TRUE(failed(Chipset::parse("gfx942_")));
   EXPECT_TRUE(failed(Chipset::parse("gfxmeow")));
   EXPECT_TRUE(failed(Chipset::parse("gfx1fff")));
 }
 
 TEST(ChipsetTest, Comparison) {
-  EXPECT_EQ(Chipset(9, 4, 0), Chipset(9, 4, 0));
-  EXPECT_NE(Chipset(9, 4, 0), Chipset(9, 4, 2));
+  EXPECT_EQ(Chipset(9, 4, 2), Chipset(9, 4, 2));
   EXPECT_NE(Chipset(9, 0, 0), Chipset(10, 0, 0));
 
   EXPECT_LT(Chipset(9, 0, 0), Chipset(10, 0, 0));
   EXPECT_LT(Chipset(9, 0, 0), Chipset(9, 4, 2));
-  EXPECT_LE(Chipset(9, 4, 1), Chipset(9, 4, 1));
   EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 4, 2) < Chipset(9, 4, 0));
 
   EXPECT_GT(Chipset(9, 0, 0xa), Chipset(9, 0, 8));
   EXPECT_GE(Chipset(9, 0, 0xa), Chipset(9, 0, 0xa));
-  EXPECT_FALSE(Chipset(9, 4, 1) >= Chipset(9, 4, 2));
-  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 0));
+  EXPECT_FALSE(Chipset(9, 0, 0xa) >= Chipset(9, 4, 2));
 }
 
 } // namespace
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index 92184ba796dbd..e83d38a14f77f 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -2854,12 +2854,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   Error checkIfAPU() {
     // TODO: replace with ROCr API once it becomes available.
     llvm::StringRef StrGfxName(ComputeUnitKind);
-    IsAPU = llvm::StringSwitch<bool>(StrGfxName)
-                .Case("gfx940", true)
-                .Default(false);
-    if (IsAPU)
-      return Plugin::success();
-
     bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
                         .Case("gfx942", true)
                         .Default(false);
diff --git a/offload/test/lit.cfg b/offload/test/lit.cfg
index 1e265d2c30904..f017bca85dd4f 100644
--- a/offload/test/lit.cfg
+++ b/offload/test/lit.cfg
@@ -134,12 +134,10 @@ elif config.libomptarget_current_target.startswith('amdgcn'):
     # amdgpu_test_arch contains a list of AMD GPUs in the system
     # only check the first one assuming that we will run the test on it.
     if not (config.amdgpu_test_arch.startswith("gfx90a") or
-            config.amdgpu_test_arch.startswith("gfx940") or
             config.amdgpu_test_arch.startswith("gfx942")):
        supports_unified_shared_memory = False
     # check if AMD architecture is an APU:
-    if (config.amdgpu_test_arch.startswith("gfx940") or
-        (config.amdgpu_test_arch.startswith("gfx942") and
+    if ((config.amdgpu_test_arch.startswith("gfx942") and
         evaluate_bool_env(config.environment['IS_APU']))):
        supports_apu = True
 if supports_unified_shared_memory: