diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index ee77e83363d37..d0193a4ec2faa 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -37,7 +37,7 @@ jobs: stage1: if: github.repository_owner == 'llvm' runs-on: libcxx-self-hosted-linux - container: ghcr.io/llvm/libcxx-linux-builder:d8a0709b1090350a7fe3604d8ab78c7d62f10698 + container: ghcr.io/llvm/libcxx-linux-builder:b319dfef21f6c7b0bc6a356d6b9f41a3b3b98ae9 continue-on-error: false strategy: fail-fast: false @@ -48,8 +48,8 @@ jobs: 'generic-cxx26', 'generic-modules' ] - cc: [ 'clang-20' ] - cxx: [ 'clang++-20' ] + cc: [ 'clang-21' ] + cxx: [ 'clang++-21' ] include: - config: 'generic-gcc' cc: 'gcc-14' @@ -75,7 +75,7 @@ jobs: stage2: if: github.repository_owner == 'llvm' runs-on: libcxx-self-hosted-linux - container: ghcr.io/llvm/libcxx-linux-builder:d8a0709b1090350a7fe3604d8ab78c7d62f10698 + container: ghcr.io/llvm/libcxx-linux-builder:b319dfef21f6c7b0bc6a356d6b9f41a3b3b98ae9 needs: [ stage1 ] continue-on-error: false strategy: @@ -88,18 +88,22 @@ jobs: 'generic-cxx20', 'generic-cxx23' ] - cc: [ 'clang-20' ] - cxx: [ 'clang++-20' ] + cc: [ 'clang-21' ] + cxx: [ 'clang++-21' ] include: - config: 'generic-gcc-cxx11' cc: 'gcc-14' cxx: 'g++-14' - - config: 'generic-cxx23' - cc: 'clang-18' - cxx: 'clang++-18' + - config: 'generic-cxx26' + cc: 'clang-20' + cxx: 'clang++-20' - config: 'generic-cxx26' cc: 'clang-19' cxx: 'clang++-19' + # Release transition + - config: 'generic-cxx23' + cc: 'clang-18' + cxx: 'clang++-18' steps: - uses: actions/checkout@v4 - name: ${{ matrix.config }} @@ -163,14 +167,14 @@ jobs: - config: 'generic-msan' machine: libcxx-self-hosted-linux runs-on: ${{ matrix.machine }} - container: ghcr.io/llvm/libcxx-linux-builder:d8a0709b1090350a7fe3604d8ab78c7d62f10698 + container: ghcr.io/llvm/libcxx-linux-builder:b319dfef21f6c7b0bc6a356d6b9f41a3b3b98ae9 steps: - uses: actions/checkout@v4 - name: ${{ matrix.config }} run: libcxx/utils/ci/run-buildbot ${{ matrix.config }} env: - CC: clang-20 - CXX: clang++-20 + CC: clang-21 + CXX: clang++-21 - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 if: always() with: diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 648dae2838e03..2fb9d5888bce4 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -492,7 +492,7 @@ class alignas(8) Decl { /// perform non-Decl specific checks based on the object's type and strict /// flex array level. static bool isFlexibleArrayMemberLike( - ASTContext &Context, const Decl *D, QualType Ty, + const ASTContext &Context, const Decl *D, QualType Ty, LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel, bool IgnoreTemplateOrMacroSubstitution); diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index cd584d9621a22..ff4f236c1fa88 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -542,7 +542,7 @@ class Expr : public ValueStmt { /// When IgnoreTemplateOrMacroSubstitution is set, it doesn't consider sizes /// resulting from the substitution of a macro or a template as special sizes. 
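(For context, and not part of the patch: a compilable aside on what the isFlexibleArrayMemberLike predicate made const-correct above actually classifies. The struct names below are invented for illustration; the level names are the LangOptions::StrictFlexArraysLevelKind enumerators.)

```cpp
// Which trailing arrays count as "flexible array member like" depends on the
// strictness level (mirroring -fstrict-flex-arrays=<N>):
struct Incomplete { int n; int tail[]; };  // counts at every level, including
                                           // IncompleteOnly (level 3)
struct Zero { int n; int tail[0]; };       // GNU extension; counts at
                                           // ZeroOrIncomplete (level 2) and below
struct One { int n; int tail[1]; };        // counts at OneZeroOrIncomplete
                                           // (level 1) and below
struct Sized { int n; int tail[8]; };      // counts only at Default (level 0)
```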
bool isFlexibleArrayMemberLike( - ASTContext &Context, + const ASTContext &Context, LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel, bool IgnoreTemplateOrMacroSubstitution = false) const; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 7c70534388b4c..4a791316a6269 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3898,7 +3898,8 @@ QualType ASTContext::getArrayParameterType(QualType Ty) const { if (Ty->isArrayParameterType()) return Ty; assert(Ty->isConstantArrayType() && "Ty must be an array type."); - const auto *ATy = cast<ConstantArrayType>(Ty.getDesugaredType(*this)); + QualType DTy = Ty.getDesugaredType(*this); + const auto *ATy = cast<ConstantArrayType>(DTy); llvm::FoldingSetNodeID ID; ATy->Profile(ID, *this, ATy->getElementType(), ATy->getZExtSize(), ATy->getSizeExpr(), ATy->getSizeModifier(), @@ -3910,7 +3911,7 @@ QualType ASTContext::getArrayParameterType(QualType Ty) const { return QualType(AT, 0); QualType Canonical; - if (!Ty.isCanonical()) { + if (!DTy.isCanonical()) { Canonical = getArrayParameterType(getCanonicalType(Ty)); // Get the new insert position for the node we care about. diff --git a/clang/lib/AST/ByteCode/Descriptor.cpp b/clang/lib/AST/ByteCode/Descriptor.cpp index 319d1690c1cd0..6017f6dd61cb3 100644 --- a/clang/lib/AST/ByteCode/Descriptor.cpp +++ b/clang/lib/AST/ByteCode/Descriptor.cpp @@ -237,8 +237,7 @@ static void moveRecord(Block *B, std::byte *Src, std::byte *Dst, assert(D); assert(D->ElemRecord); - // FIXME: There might be cases where we need to move over the (v)bases as - well. + // FIXME: Code duplication. for (const auto &F : D->ElemRecord->fields()) { auto FieldOffset = F.Offset; const auto *SrcDesc = @@ -250,6 +249,26 @@ static void moveRecord(Block *B, std::byte *Src, std::byte *Dst, if (auto Fn = F.Desc->MoveFn) Fn(B, Src + FieldOffset, Dst + FieldOffset, F.Desc); } + + for (const auto &Base : D->ElemRecord->bases()) { + auto BaseOffset = Base.Offset; + const auto *SrcDesc = + reinterpret_cast<const InlineDescriptor *>(Src + BaseOffset) - 1; + auto *DestDesc = reinterpret_cast<InlineDescriptor *>(Dst + BaseOffset) - 1; + std::memcpy(DestDesc, SrcDesc, sizeof(InlineDescriptor)); + + if (auto Fn = Base.Desc->MoveFn) + Fn(B, Src + BaseOffset, Dst + BaseOffset, Base.Desc); + } + + for (const auto &VBase : D->ElemRecord->virtual_bases()) { + auto VBaseOffset = VBase.Offset; + const auto *SrcDesc = + reinterpret_cast<const InlineDescriptor *>(Src + VBaseOffset) - 1; + auto *DestDesc = + reinterpret_cast<InlineDescriptor *>(Dst + VBaseOffset) - 1; + std::memcpy(DestDesc, SrcDesc, sizeof(InlineDescriptor)); + } } static BlockCtorFn getCtorPrim(PrimType Type) { diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index fc16448cf9e90..ab9d4869a74ee 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -438,7 +438,7 @@ bool Decl::isFileContextDecl() const { } bool Decl::isFlexibleArrayMemberLike( - ASTContext &Ctx, const Decl *D, QualType Ty, + const ASTContext &Ctx, const Decl *D, QualType Ty, LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel, bool IgnoreTemplateOrMacroSubstitution) { // For compatibility with existing code, we treat arrays of length 0 or diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index 6f570139630d8..1f949d495f343 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -203,7 +203,7 @@ bool Expr::isKnownToHaveBooleanValue(bool Semantic) const { } bool Expr::isFlexibleArrayMemberLike( - ASTContext &Ctx, + const ASTContext &Ctx, LangOptions::StrictFlexArraysLevelKind StrictFlexArraysLevel, bool 
IgnoreTemplateOrMacroSubstitution) const { const Expr *E = IgnoreParens(); diff --git a/clang/lib/AST/TextNodeDumper.cpp b/clang/lib/AST/TextNodeDumper.cpp index 08efcda46b8f9..26493caa5d06a 100644 --- a/clang/lib/AST/TextNodeDumper.cpp +++ b/clang/lib/AST/TextNodeDumper.cpp @@ -2189,6 +2189,11 @@ void TextNodeDumper::VisitEnumDecl(const EnumDecl *D) { OS << " __module_private__"; if (D->isFixed()) dumpType(D->getIntegerType()); + + if (const auto *Instance = D->getInstantiatedFromMemberEnum()) { + OS << " instantiated_from"; + dumpPointer(Instance); + } } void TextNodeDumper::VisitRecordDecl(const RecordDecl *D) { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index d57f491a20c8e..348cb523b1718 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -20723,9 +20723,19 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_bitop3_b16: return emitBuiltinWithOneOverloadedType<4>(*this, E, Intrinsic::amdgcn_bitop3); - case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: - return emitBuiltinWithOneOverloadedType<4>( - *this, E, Intrinsic::amdgcn_make_buffer_rsrc); + case AMDGPU::BI__builtin_amdgcn_make_buffer_rsrc: { + // TODO: LLVM has this overloaded to allow for fat pointers, but since + // those haven't been plumbed through to Clang yet, default to creating the + // resource type. + SmallVector<Value *, 4> Args; + for (unsigned I = 0; I < 4; ++I) + Args.push_back(EmitScalarExpr(E->getArg(I))); + llvm::PointerType *RetTy = llvm::PointerType::get( + Builder.getContext(), llvm::AMDGPUAS::BUFFER_RESOURCE); + Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_make_buffer_rsrc, + {RetTy, Args[0]->getType()}); + return Builder.CreateCall(F, Args); + } case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b8: case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b16: case AMDGPU::BI__builtin_amdgcn_raw_buffer_store_b32: diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 9f7db25a15bec..c2289985e9519 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -8879,7 +8879,7 @@ static void emitOffloadingArraysAndArgs( }; auto CustomMapperCB = [&](unsigned int I) { - llvm::Value *MFunc = nullptr; + llvm::Function *MFunc = nullptr; if (CombinedInfo.Mappers[I]) { Info.HasMapper = true; MFunc = CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc( @@ -8887,9 +8887,9 @@ static void emitOffloadingArraysAndArgs( } return MFunc; }; - OMPBuilder.emitOffloadingArraysAndArgs( - AllocaIP, CodeGenIP, Info, Info.RTArgs, CombinedInfo, IsNonContiguous, - ForEndCall, DeviceAddrCB, CustomMapperCB); + cantFail(OMPBuilder.emitOffloadingArraysAndArgs( + AllocaIP, CodeGenIP, Info, Info.RTArgs, CombinedInfo, CustomMapperCB, + IsNonContiguous, ForEndCall, DeviceAddrCB)); } /// Check for inner distribute directive. @@ -9082,15 +9082,15 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, return CombinedInfo; }; - auto CustomMapperCB = [&](unsigned I, llvm::Function **MapperFunc) { + auto CustomMapperCB = [&](unsigned I) { + llvm::Function *MapperFunc = nullptr; if (CombinedInfo.Mappers[I]) { // Call the corresponding mapper function. 
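(Aside, not part of the patch: the CustomMapperCB change in these hunks, reduced to a freestanding sketch. The names are illustrative stand-ins, not the OpenMPIRBuilder API.)

```cpp
// Standalone sketch of the callback-signature change: instead of a bool
// return plus an llvm::Function** out-parameter, the callback now returns
// the mapper function directly, with nullptr meaning "no user-defined mapper".
struct Function {};                        // stand-in for llvm::Function
static Function *getOrCreateMapper(unsigned I) {
  static Function TheMapper;
  return I == 0 ? &TheMapper : nullptr;    // pretend only component 0 has one
}
// Old shape:  bool cb(unsigned I, Function **Out);
// New shape:
auto customMapperCB = [](unsigned I) -> Function * {
  return getOrCreateMapper(I);             // nullptr == no mapper
};
int main() { return customMapperCB(0) != nullptr ? 0 : 1; }
```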
- *MapperFunc = getOrCreateUserDefinedMapperFunc( + MapperFunc = getOrCreateUserDefinedMapperFunc( cast<OMPDeclareMapperDecl>(CombinedInfo.Mappers[I])); - assert(*MapperFunc && "Expect a valid mapper function is available."); - return true; + assert(MapperFunc && "Expect a valid mapper function is available."); } - return false; + return MapperFunc; }; SmallString<64> TyStr; @@ -9098,8 +9098,8 @@ void CGOpenMPRuntime::emitUserDefinedMapper(const OMPDeclareMapperDecl *D, CGM.getCXXABI().getMangleContext().mangleCanonicalTypeName(Ty, Out); std::string Name = getName({"omp_mapper", TyStr, D->getName()}); - auto *NewFn = OMPBuilder.emitUserDefinedMapper(PrivatizeAndGenMapInfoCB, - ElemTy, Name, CustomMapperCB); + llvm::Function *NewFn = cantFail(OMPBuilder.emitUserDefinedMapper( + PrivatizeAndGenMapInfoCB, ElemTy, Name, CustomMapperCB)); UDMMap.try_emplace(D, NewFn); if (CGF) FunctionUDMMap[CGF->CurFn].push_back(D); @@ -10073,7 +10073,7 @@ void CGOpenMPRuntime::emitTargetDataCalls( }; auto CustomMapperCB = [&](unsigned int I) { - llvm::Value *MFunc = nullptr; + llvm::Function *MFunc = nullptr; if (CombinedInfo.Mappers[I]) { Info.HasMapper = true; MFunc = CGF.CGM.getOpenMPRuntime().getOrCreateUserDefinedMapperFunc( @@ -10093,7 +10093,8 @@ void CGOpenMPRuntime::emitTargetDataCalls( llvm::OpenMPIRBuilder::InsertPointTy AfterIP = cantFail(OMPBuilder.createTargetData( OmpLoc, AllocaIP, CodeGenIP, DeviceID, IfCondVal, Info, GenMapInfoCB, - /*MapperFunc=*/nullptr, BodyCB, DeviceAddrCB, CustomMapperCB, RTLoc)); + CustomMapperCB, + /*MapperFunc=*/nullptr, BodyCB, DeviceAddrCB, RTLoc)); CGF.Builder.restoreIP(AfterIP); } diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 3cc3210841e0f..a13f30cd23119 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1715,3 +1715,22 @@ namespace IgnoredMemberExpr { }; static_assert(B{}.foo() == 0, ""); } + +#if __cplusplus >= 202002L +namespace DeadUpcast { + struct A {}; + struct B : A{}; + constexpr bool foo() { + + B *pb; + { + B b; + pb = &b; + } + A *pa = pb; + + return true; + } + static_assert(foo(), ""); +} +#endif diff --git a/clang/test/AST/HLSL/TypdefArrayParam.hlsl b/clang/test/AST/HLSL/TypdefArrayParam.hlsl index c6ae168f84064..37f7a66de23a1 100644 --- a/clang/test/AST/HLSL/TypdefArrayParam.hlsl +++ b/clang/test/AST/HLSL/TypdefArrayParam.hlsl @@ -55,3 +55,14 @@ void call2() { uint4 C[2] = {A,A}; uint32_t D = Accumulate(C); } + +typedef int Foo[2]; + +// CHECK-LABEL: call3 +// CHECK: ArraySubscriptExpr {{.*}} 'int' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'int *' +// CHECK-NEXT: DeclRefExpr {{.*}} 'int[2]' lvalue ParmVar {{.*}} 'F' 'int[2]' +// CHECK-NEXT: IntegerLiteral {{.*}} 'int' 0 +int call3(Foo F) { + return F[0]; +} diff --git a/clang/test/AST/ast-dump-decl.cpp b/clang/test/AST/ast-dump-decl.cpp index e84241cee922f..d79051fb6efaa 100644 --- a/clang/test/AST/ast-dump-decl.cpp +++ b/clang/test/AST/ast-dump-decl.cpp @@ -15,7 +15,9 @@ class testEnumDecl { enum TestEnumDeclFixed : int; }; // CHECK: EnumDecl{{.*}} class TestEnumDeclScoped 'int' +// CHECK-NOT: instantiated_from // CHECK: EnumDecl{{.*}} TestEnumDeclFixed 'int' +// CHECK-NOT: instantiated_from class testFieldDecl { int TestFieldDeclInit = 0; @@ -328,9 +330,9 @@ namespace testClassTemplateDecl { // CHECK-NEXT: | | `-Destructor irrelevant non_trivial user_declared{{$}} // CHECK-NEXT: | |-CXXRecordDecl 0x{{.+}} col:30 implicit referenced class TestClassTemplate{{$}} // CHECK-NEXT: | |-AccessSpecDecl 0x{{.+}} col:3 public{{$}} -// 
CHECK-NEXT: | |-CXXConstructorDecl 0x{{.+}} col:5 TestClassTemplate 'void ()'{{$}} -// CHECK-NEXT: | |-CXXDestructorDecl 0x{{.+}} col:5 ~TestClassTemplate 'void ()' not_selected{{$}} -// CHECK-NEXT: | |-CXXMethodDecl 0x{{.+}} col:9 j 'int ()'{{$}} +// CHECK-NEXT: | |-CXXConstructorDecl 0x[[#%x,TEMPLATE_CONSTRUCTOR_DECL:]] col:5 TestClassTemplate 'void ()'{{$}} +// CHECK-NEXT: | |-CXXDestructorDecl 0x[[#%x,TEMPLATE_DESTRUCTOR_DECL:]] col:5 ~TestClassTemplate 'void ()' not_selected{{$}} +// CHECK-NEXT: | |-CXXMethodDecl 0x[[#%x,TEMPLATE_METHOD_DECL:]] col:9 j 'int ()'{{$}} // CHECK-NEXT: | `-FieldDecl 0x{{.+}} col:9 i 'int'{{$}} // CHECK-NEXT: |-ClassTemplateSpecializationDecl 0x{{.+}} line:[[@LINE-56]]:30 class TestClassTemplate definition implicit_instantiation{{$}} // CHECK-NEXT: | |-DefinitionData standard_layout has_user_declared_ctor can_const_default_init{{$}} @@ -345,9 +347,9 @@ namespace testClassTemplateDecl { // CHECK-NEXT: | | `-CXXRecord 0x{{.+}} 'A'{{$}} // CHECK-NEXT: | |-CXXRecordDecl 0x{{.+}} col:30 implicit class TestClassTemplate{{$}} // CHECK-NEXT: | |-AccessSpecDecl 0x{{.+}} col:3 public{{$}} -// CHECK-NEXT: | |-CXXConstructorDecl 0x{{.+}} col:5 used TestClassTemplate 'void ()' implicit_instantiation instantiated_from {{0x[^ ]+}}{{$}} -// CHECK-NEXT: | |-CXXDestructorDecl 0x{{.+}} col:5 used ~TestClassTemplate 'void () noexcept' implicit_instantiation instantiated_from {{0x[^ ]+}}{{$}} -// CHECK-NEXT: | |-CXXMethodDecl 0x{{.+}} col:9 j 'int ()' implicit_instantiation instantiated_from {{0x[^ ]+}}{{$}} +// CHECK-NEXT: | |-CXXConstructorDecl 0x{{.+}} col:5 used TestClassTemplate 'void ()' implicit_instantiation instantiated_from 0x[[#TEMPLATE_CONSTRUCTOR_DECL]]{{$}} +// CHECK-NEXT: | |-CXXDestructorDecl 0x{{.+}} col:5 used ~TestClassTemplate 'void () noexcept' implicit_instantiation instantiated_from 0x[[#TEMPLATE_DESTRUCTOR_DECL]]{{$}} +// CHECK-NEXT: | |-CXXMethodDecl 0x{{.+}} col:9 j 'int ()' implicit_instantiation instantiated_from 0x[[#TEMPLATE_METHOD_DECL]]{{$}} // CHECK-NEXT: | |-FieldDecl 0x{{.+}} col:9 i 'int'{{$}} // CHECK-NEXT: | `-CXXConstructorDecl 0x{{.+}} col:30 implicit constexpr TestClassTemplate 'void (const TestClassTemplate &)' inline default trivial noexcept-unevaluated 0x{{.+}}{{$}} // CHECK-NEXT: | `-ParmVarDecl 0x{{.+}} col:30 'const TestClassTemplate &'{{$}} @@ -487,6 +489,109 @@ namespace testClassTemplateDecl { // CHECK-NEXT: `-CXXRecordDecl 0x{{.+}} col:48 implicit struct TestTemplateTemplateDefaultType{{$}} + +namespace testClassTemplateDecl { + template<typename T> struct TestClassTemplateWithScopedMemberEnum { + enum class E1 : T { A, B, C, D }; + enum class E2 : int { A, B, C, D }; + enum class E3 : T; + enum class E4 : int; + }; + + template struct TestClassTemplateWithScopedMemberEnum<unsigned>; + + TestClassTemplateWithScopedMemberEnum<int> TestClassTemplateWithScopedMemberEnumObject; +} + +// CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-12]]:3, line:[[@LINE-7]]:3> line:[[@LINE-12]]:31 TestClassTemplateWithScopedMemberEnum +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 referenced typename depth 0 index 0 T +// CHECK-NEXT: |-CXXRecordDecl 0x{{.+}} line:[[@LINE-14]]:31 struct TestClassTemplateWithScopedMemberEnum definition +// CHECK: | |-EnumDecl 0x[[#%x,SCOPED_MEMBER_ENUM_E1:]] col:16 class E1 'T' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:25 A 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:28 B 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// 
CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:31 C 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | | `-EnumConstantDecl 0x{{.+}} col:34 D 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumDecl 0x[[#%x,SCOPED_MEMBER_ENUM_E2:]] col:16 class E2 'int' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:27 A 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:30 B 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:33 C 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | | `-EnumConstantDecl 0x{{.+}} col:36 D 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumDecl 0x[[#%x,SCOPED_MEMBER_ENUM_E3:]] col:16 class E3 'T' +// CHECK-NEXT: | `-EnumDecl 0x[[#%x,SCOPED_MEMBER_ENUM_E4:]] col:16 class E4 'int' +// CHECK-NEXT: |-ClassTemplateSpecialization 0x{{.+}} 'TestClassTemplateWithScopedMemberEnum' +// CHECK-NEXT: `-ClassTemplateSpecializationDecl 0x{{.+}} line:[[@LINE-28]]:31 struct TestClassTemplateWithScopedMemberEnum definition implicit_instantiation +// CHECK: |-TemplateArgument type 'int' +// CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'int' +// CHECK: |-EnumDecl 0x{{.+}} col:16 class E1 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E1]]{{$}} +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:16 class E2 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E2]]{{$}} +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:16 class E3 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E3]]{{$}} +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:16 class E4 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E4]]{{$}} + +// CHECK: ClassTemplateSpecializationDecl 0x{{.+}} <{{.+}}:[[@LINE-29]]:3, col:65> col:19 struct TestClassTemplateWithScopedMemberEnum definition explicit_instantiation_definition +// CHECK: |-TemplateArgument type 'unsigned int' +// CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'unsigned int' +// CHECK: |-EnumDecl 0x{{.+}} col:16 class E1 'unsigned int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E1]]{{$}} +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:25 A 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:28 B 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:31 C 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: | `-EnumConstantDecl 0x{{.+}} col:34 D 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E1' +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:16 class E2 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E2]]{{$}} +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:27 A 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:30 B 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:33 C 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: | `-EnumConstantDecl 0x{{.+}} col:36 D 'testClassTemplateDecl::TestClassTemplateWithScopedMemberEnum::E2' +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:16 class E3 'unsigned int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E3]]{{$}} +// CHECK-NEXT: `-EnumDecl 0x{{.+}} col:16 class E4 'int' instantiated_from 0x[[#SCOPED_MEMBER_ENUM_E4]]{{$}} + + + + +namespace testClassTemplateDecl { + template<typename T> struct 
TestClassTemplateWithUnscopedMemberEnum { + enum E1 : T { E1_A, E1_B, E1_C, E1_D }; + enum E2 : int { E2_A, E2_B, E2_C, E2_D }; + enum E3 : T; + enum E4 : int; + }; + + template struct TestClassTemplateWithUnscopedMemberEnum<unsigned>; + + TestClassTemplateWithUnscopedMemberEnum<unsigned> TestClassTemplateWithUnscopedMemberEnumObject; +} + +// CHECK: ClassTemplateDecl 0x{{.+}} <{{.+}}:[[@LINE-12]]:3, line:[[@LINE-7]]:3> line:[[@LINE-12]]:31 TestClassTemplateWithUnscopedMemberEnum +// CHECK-NEXT: |-TemplateTypeParmDecl 0x{{.+}} col:21 referenced typename depth 0 index 0 T +// CHECK-NEXT: |-CXXRecordDecl 0x{{.+}} line:[[@LINE-14]]:31 struct TestClassTemplateWithUnscopedMemberEnum definition +// CHECK: | |-EnumDecl 0x[[#%x,UNSCOPED_MEMBER_ENUM_E1:]] col:10 E1 'T' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:19 E1_A 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:25 E1_B 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:31 E1_C 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | | `-EnumConstantDecl 0x{{.+}} col:37 E1_D 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumDecl 0x[[#%x,UNSCOPED_MEMBER_ENUM_E2:]] col:10 E2 'int' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:21 E2_A 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:27 E2_B 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | | |-EnumConstantDecl 0x{{.+}} col:33 E2_C 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | | `-EnumConstantDecl 0x{{.+}} col:39 E2_D 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumDecl 0x[[#%x,UNSCOPED_MEMBER_ENUM_E3:]] col:10 E3 'T' +// CHECK-NEXT: | `-EnumDecl 0x[[#%x,UNSCOPED_MEMBER_ENUM_E4:]] col:10 E4 'int' +// CHECK-NEXT: `-ClassTemplateSpecialization {{.+}} 'TestClassTemplateWithUnscopedMemberEnum' + +// CHECK: ClassTemplateSpecializationDecl 0x{{.+}} <{{.+}}:[[@LINE-22]]:3, col:67> col:19 struct TestClassTemplateWithUnscopedMemberEnum definition explicit_instantiation_definition +// CHECK: |-TemplateArgument type 'unsigned int' +// CHECK-NEXT: | `-BuiltinType 0x{{.+}} 'unsigned int' +// CHECK: |-EnumDecl 0x{{.+}} col:10 E1 'unsigned int' instantiated_from 0x[[#UNSCOPED_MEMBER_ENUM_E1]]{{$}} +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:19 E1_A 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:25 E1_B 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:31 E1_C 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: | `-EnumConstantDecl 0x{{.+}} col:37 E1_D 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E1' +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:10 E2 'int' instantiated_from 0x[[#UNSCOPED_MEMBER_ENUM_E2]]{{$}} +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:21 E2_A 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:27 E2_B 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | |-EnumConstantDecl 0x{{.+}} col:33 E2_C 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: | `-EnumConstantDecl 
0x{{.+}} col:39 E2_D 'testClassTemplateDecl::TestClassTemplateWithUnscopedMemberEnum::E2' +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:10 E3 'unsigned int' instantiated_from 0x[[#UNSCOPED_MEMBER_ENUM_E3]]{{$}} +// CHECK-NEXT: |-EnumDecl 0x{{.+}} col:10 E4 'int' instantiated_from 0x[[#UNSCOPED_MEMBER_ENUM_E4]]{{$}} + + // PR15220 dump instantiation only once namespace testCanonicalTemplate { class A {}; diff --git a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip index c1a30633f3d0a..2342fcefb5f89 100644 --- a/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip +++ b/clang/test/CodeGenHIP/builtins-make-buffer-rsrc.hip @@ -25,7 +25,7 @@ // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 [[TMP3]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP4]] // __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) { @@ -49,7 +49,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 4, i32 [[TMP1]], i32 [[TMP2]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP3]] // __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) { @@ -73,7 +73,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constan // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[FLAGS_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 1234, i32 [[TMP2]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP3]] // __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) { @@ -97,7 +97,7 @@ __device__ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(v // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P_ADDR_ASCAST]], align 8 // CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[STRIDE_ADDR_ASCAST]], align 2 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[NUM_ADDR_ASCAST]], align 4 -// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678) +// CHECK-NEXT: [[TMP3:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[TMP0]], i16 [[TMP1]], i32 [[TMP2]], i32 5678) // CHECK-NEXT: ret ptr addrspace(8) [[TMP3]] // __device__ 
__amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) { diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl index 2c7bc10fb609c..29093c09c39d0 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-make-buffer-rsrc.cl @@ -4,7 +4,7 @@ // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, int num, int flags) { @@ -13,7 +13,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0(void *p, short stride, in // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_stride_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, int num, int flags) { @@ -22,7 +22,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_stride_constant(void *p, // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_num_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, short stride, int flags) { @@ -31,7 +31,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_num_constant(void *p, sho // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p0_flags_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, short stride, int num) { @@ -40,7 +40,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p0_flags_constant(void *p, s // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void 
*p, short stride, int num, int flags) { @@ -49,7 +49,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1(global void *p, short str // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_stride_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 4, i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global void *p, int num, int flags) { @@ -58,7 +58,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_stride_constant(global vo // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_num_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 1234, i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void *p, short stride, int flags) { @@ -67,7 +67,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_num_constant(global void // CHECK-LABEL: @test_amdgcn_make_buffer_rsrc_p1_flags_constant( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P:%.*]], i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 5678) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global void *p, short stride, int num) { @@ -76,7 +76,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_rsrc_p1_flags_constant(global voi // CHECK-LABEL: @test_amdgcn_make_buffer_p0_nullptr( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, int flags) { @@ -85,7 +85,7 @@ __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p0_nullptr(short stride, int num, // CHECK-LABEL: @test_amdgcn_make_buffer_p1_nullptr( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) +// CHECK-NEXT: [[TMP0:%.*]] = tail call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) null, i16 [[STRIDE:%.*]], i32 [[NUM:%.*]], i32 [[FLAGS:%.*]]) // CHECK-NEXT: ret ptr addrspace(8) [[TMP0]] // __amdgpu_buffer_rsrc_t test_amdgcn_make_buffer_p1_nullptr(short stride, int num, int flags) { diff --git a/flang/include/flang/Lower/AbstractConverter.h b/flang/include/flang/Lower/AbstractConverter.h index 3d2b805da6f47..1d1323642bf9c 100644 --- 
a/flang/include/flang/Lower/AbstractConverter.h +++ b/flang/include/flang/Lower/AbstractConverter.h @@ -314,6 +314,8 @@ class AbstractConverter { mangleName(const Fortran::semantics::DerivedTypeSpec &) = 0; /// Unique a compiler generated name (add a containing scope specific prefix) virtual std::string mangleName(std::string &) = 0; + /// Unique a compiler generated name (add a provided scope specific prefix) + virtual std::string mangleName(std::string &, const semantics::Scope &) = 0; /// Return the field name for a derived type component inside a fir.record /// type. virtual std::string diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 36e58e456dea3..7c217ce2f404c 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -1049,6 +1049,11 @@ class FirConverter : public Fortran::lower::AbstractConverter { return Fortran::lower::mangle::mangleName(name, getCurrentScope(), scopeBlockIdMap); } + std::string + mangleName(std::string &name, + const Fortran::semantics::Scope &myScope) override final { + return Fortran::lower::mangle::mangleName(name, myScope, scopeBlockIdMap); + } std::string getRecordTypeFieldName( const Fortran::semantics::Symbol &component) override final { return Fortran::lower::mangle::getRecordTypeFieldName(component, diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp index febc6adcf9d6f..e21d299570b86 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.cpp +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.cpp @@ -969,8 +969,11 @@ void ClauseProcessor::processMapObjects( llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, std::map<Object, OmpMapParentAndMemberData> &parentMemberIndices, llvm::SmallVectorImpl<mlir::Value> &mapVars, - llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms) const { + llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms, + llvm::StringRef mapperIdNameRef) const { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::FlatSymbolRefAttr mapperId; + std::string mapperIdName = mapperIdNameRef.str(); for (const omp::Object &object : objects) { llvm::SmallVector<mlir::Value> bounds; @@ -1003,6 +1006,20 @@ void ClauseProcessor::processMapObjects( } } + if (!mapperIdName.empty()) { + if (mapperIdName == "default") { + auto &typeSpec = object.sym()->owner().IsDerivedType() ? *object.sym()->owner().derivedTypeSpec() : object.sym()->GetType()->derivedTypeSpec(); + mapperIdName = typeSpec.name().ToString() + ".default"; + mapperIdName = converter.mangleName(mapperIdName, *typeSpec.GetScope()); + } + assert(converter.getModuleOp().lookupSymbol(mapperIdName) && "mapper not found"); + mapperId = mlir::FlatSymbolRefAttr::get(&converter.getMLIRContext(), mapperIdName); + mapperIdName.clear(); + } // Explicit map captures are captured ByRef by default, // optimisation passes may alter this to ByCopy or other capture // types to optimise @@ -1016,7 +1033,8 @@ void ClauseProcessor::processMapObjects( static_cast< std::underlying_type_t<llvm::omp::OpenMPOffloadMappingFlags>>( mapTypeBits), - mlir::omp::VariableCaptureKind::ByRef, baseOp.getType()); + mlir::omp::VariableCaptureKind::ByRef, baseOp.getType(), + /*partialMap=*/false, mapperId); if (parentObj.has_value()) { parentMemberIndices[parentObj.value()].addChildIndexAndMapToParent( @@ -1047,6 +1065,7 @@ bool ClauseProcessor::processMap( const auto &[mapType, typeMods, mappers, iterator, objects] = clause.t; llvm::omp::OpenMPOffloadMappingFlags mapTypeBits = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE; + std::string mapperIdName; // If the map type is specified, then process it else Tofrom is the // default. 
Map::MapType type = mapType.value_or(Map::MapType::Tofrom); @@ -1090,13 +1109,17 @@ bool ClauseProcessor::processMap( "Support for iterator modifiers is not implemented yet"); } if (mappers) { - TODO(currentLocation, - "Support for mapper modifiers is not implemented yet"); + assert(mappers->size() == 1 && "more than one mapper"); + mapperIdName = mappers->front().v.id().symbol->name().ToString(); + if (mapperIdName != "default") + mapperIdName = converter.mangleName( + mapperIdName, mappers->front().v.id().symbol->owner()); } processMapObjects(stmtCtx, clauseLocation, std::get<omp::ObjectList>(clause.t), mapTypeBits, - parentMemberIndices, result.mapVars, *ptrMapSyms); + parentMemberIndices, result.mapVars, *ptrMapSyms, + mapperIdName); }; bool clauseFound = findRepeatableClause<omp::clause::Map>(process); diff --git a/flang/lib/Lower/OpenMP/ClauseProcessor.h b/flang/lib/Lower/OpenMP/ClauseProcessor.h index e05f66c766684..889a09a8f0cd8 100644 --- a/flang/lib/Lower/OpenMP/ClauseProcessor.h +++ b/flang/lib/Lower/OpenMP/ClauseProcessor.h @@ -175,7 +175,8 @@ class ClauseProcessor { llvm::omp::OpenMPOffloadMappingFlags mapTypeBits, std::map<Object, OmpMapParentAndMemberData> &parentMemberIndices, llvm::SmallVectorImpl<mlir::Value> &mapVars, - llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms) const; + llvm::SmallVectorImpl<const semantics::Symbol *> &mapSyms, + llvm::StringRef mapperIdNameRef = "") const; lower::AbstractConverter &converter; semantics::SemanticsContext &semaCtx; diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index bd794033cdf11..e0d23fc53eeca 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3119,7 +3119,51 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, const parser::OpenMPDeclareMapperConstruct &declareMapperConstruct) { - TODO(converter.getCurrentLocation(), "OpenMPDeclareMapperConstruct"); + mlir::Location loc = converter.genLocation(declareMapperConstruct.source); + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + lower::StatementContext stmtCtx; + const auto &spec = + std::get<parser::OmpDeclareMapperSpecifier>(declareMapperConstruct.t); + const auto &mapperName{std::get<std::optional<parser::Name>>(spec.t)}; + const auto &varType{std::get<parser::TypeSpec>(spec.t)}; + const auto &varName{std::get<parser::Name>(spec.t)}; + assert(varType.declTypeSpec->category() == + semantics::DeclTypeSpec::Category::TypeDerived && + "Expected derived type"); + + std::string mapperNameStr; + if (mapperName.has_value()) { + mapperNameStr = mapperName->ToString(); + mapperNameStr = + converter.mangleName(mapperNameStr, mapperName->symbol->owner()); + } else { + mapperNameStr = + varType.declTypeSpec->derivedTypeSpec().name().ToString() + ".default"; + mapperNameStr = converter.mangleName( + mapperNameStr, *varType.declTypeSpec->derivedTypeSpec().GetScope()); + } + + // Save current insertion point before moving to the module scope to create + // the DeclareMapperOp + mlir::OpBuilder::InsertionGuard guard(firOpBuilder); + + firOpBuilder.setInsertionPointToStart(converter.getModuleOp().getBody()); + auto mlirType = converter.genType(varType.declTypeSpec->derivedTypeSpec()); + auto declMapperOp = firOpBuilder.create<mlir::omp::DeclareMapperOp>( loc, mapperNameStr, mlirType); + auto &region = declMapperOp.getRegion(); + firOpBuilder.createBlock(&region); + auto varVal = region.addArgument(firOpBuilder.getRefType(mlirType), loc); + converter.bindSymbol(*varName.symbol, varVal); + + // Populate the declareMapper region with the map information. 
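(Aside, not part of the patch: the symbol-naming convention implemented just above, as a freestanding sketch. The real scope-specific prefixing lives in Fortran::lower::mangle, so the "_QQF..." shape mentioned in the comment is only indicative.)

```cpp
#include <optional>
#include <string>
// A named mapper ("declare mapper(my_mapper : ty :: v)") keeps its own name;
// an unnamed one falls back to "<derived-type-name>.default". Either way the
// result is then uniqued with a scope-specific prefix (e.g. something like
// "_QQFfoomy_mapper" for a mapper declared in subroutine foo), which is what
// lets map clauses look the mapper up by symbol later.
std::string mapperSymbolBaseName(const std::optional<std::string> &mapperName,
                                 const std::string &derivedTypeName) {
  return mapperName ? *mapperName : derivedTypeName + ".default";
}
```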
+ mlir::omp::DeclareMapperInfoOperands clauseOps; + const auto *clauseList{ parser::Unwrap<parser::OmpClauseList>(declareMapperConstruct.t)}; + List<Clause> clauses = makeClauses(*clauseList, semaCtx); + ClauseProcessor cp(converter, semaCtx, clauses); + cp.processMap(loc, stmtCtx, clauseOps); + firOpBuilder.create<mlir::omp::DeclareMapperInfoOp>(loc, clauseOps.mapVars); } static void diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp index 35722fa7d1b12..fa1975dac789b 100644 --- a/flang/lib/Lower/OpenMP/Utils.cpp +++ b/flang/lib/Lower/OpenMP/Utils.cpp @@ -125,7 +125,7 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::ArrayRef<mlir::Value> members, mlir::ArrayAttr membersIndex, uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool partialMap) { + bool partialMap, mlir::FlatSymbolRefAttr mapperId) { if (auto boxTy = llvm::dyn_cast<fir::BaseBoxType>(baseAddr.getType())) { baseAddr = builder.create<fir::BoxAddrOp>(loc, baseAddr); retTy = baseAddr.getType(); @@ -144,6 +144,7 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, mlir::omp::MapInfoOp op = builder.create<mlir::omp::MapInfoOp>( loc, retTy, baseAddr, varType, varPtrPtr, members, membersIndex, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + mapperId, builder.getAttr<mlir::omp::VariableCaptureKindAttr>(mapCaptureType), builder.getStringAttr(name), builder.getBoolAttr(partialMap)); return op; diff --git a/flang/lib/Lower/OpenMP/Utils.h b/flang/lib/Lower/OpenMP/Utils.h index f2e378443e5f2..3943eb633b04e 100644 --- a/flang/lib/Lower/OpenMP/Utils.h +++ b/flang/lib/Lower/OpenMP/Utils.h @@ -116,7 +116,8 @@ createMapInfoOp(fir::FirOpBuilder &builder, mlir::Location loc, llvm::ArrayRef<mlir::Value> members, mlir::ArrayAttr membersIndex, uint64_t mapType, mlir::omp::VariableCaptureKind mapCaptureType, mlir::Type retTy, - bool partialMap = false); + bool partialMap = false, + mlir::FlatSymbolRefAttr mapperId = mlir::FlatSymbolRefAttr()); void insertChildMapInfoIntoParent( Fortran::lower::AbstractConverter &converter, diff --git a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp index 98e325c307d97..beea7543e54b3 100644 --- a/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp +++ b/flang/lib/Optimizer/OpenMP/MapInfoFinalization.cpp @@ -184,6 +184,7 @@ class MapInfoFinalizationPass /*members=*/mlir::SmallVector<mlir::Value>{}, /*membersIndex=*/mlir::ArrayAttr{}, bounds, builder.getIntegerAttr(builder.getIntegerType(64, false), mapType), + /*mapperId*/ mlir::FlatSymbolRefAttr(), builder.getAttr<mlir::omp::VariableCaptureKindAttr>( mlir::omp::VariableCaptureKind::ByRef), /*name=*/builder.getStringAttr(""), @@ -329,7 +330,8 @@ class MapInfoFinalizationPass builder.getIntegerAttr( builder.getIntegerType(64, false), getDescriptorMapType(op.getMapType().value_or(0), target)), - op.getMapCaptureTypeAttr(), op.getNameAttr(), + /*mapperId*/ mlir::FlatSymbolRefAttr(), op.getMapCaptureTypeAttr(), + op.getNameAttr(), /*partial_map=*/builder.getBoolAttr(false)); op.replaceAllUsesWith(newDescParentMapOp.getResult()); op->erase(); @@ -464,7 +466,8 @@ class MapInfoFinalizationPass for (auto *user : mapOp->getUsers()) { if (llvm::isa<mlir::omp::TargetOp, mlir::omp::TargetDataOp, mlir::omp::TargetUpdateOp, mlir::omp::TargetExitDataOp, - mlir::omp::TargetEnterDataOp>(user)) + mlir::omp::TargetEnterDataOp, + mlir::omp::DeclareMapperInfoOp>(user)) return user; if (auto mapUser = llvm::dyn_cast<mlir::omp::TargetOp>(user)) @@ -497,7 +500,9 @@ class MapInfoFinalizationPass // ourselves to the possibility of race conditions while this pass // undergoes frequent re-iteration for the near future. So we loop // over function in the module and then map.info inside of those. 
- getOperation()->walk([&](mlir::func::FuncOp func) { + getOperation()->walk([&](mlir::Operation *func) { + if (!mlir::isa<mlir::func::FuncOp, mlir::omp::DeclareMapperOp>(func)) + return; // clear all local allocations we made for any boxes in any prior // iterations from previous function scopes. localBoxAllocas.clear(); @@ -620,6 +625,7 @@ class MapInfoFinalizationPass /*members=*/mlir::ValueRange{}, /*members_index=*/mlir::ArrayAttr{}, /*bounds=*/bounds, op.getMapTypeAttr(), + /*mapperId*/ mlir::FlatSymbolRefAttr(), builder.getAttr<mlir::omp::VariableCaptureKindAttr>( mlir::omp::VariableCaptureKind::ByRef), builder.getStringAttr(op.getNameAttr().strref() + "." + diff --git a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp index 963ae863c1fc5..97ea463a3c495 100644 --- a/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp +++ b/flang/lib/Optimizer/OpenMP/MapsForPrivatizedSymbols.cpp @@ -91,6 +91,7 @@ class MapsForPrivatizedSymbolsPass /*bounds=*/ValueRange{}, builder.getIntegerAttr(builder.getIntegerType(64, /*isSigned=*/false), mapTypeTo), + /*mapperId*/ mlir::FlatSymbolRefAttr(), builder.getAttr<omp::VariableCaptureKindAttr>( omp::VariableCaptureKind::ByRef), StringAttr(), builder.getBoolAttr(false)); diff --git a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir index 8e4e1fe824d9f..7cdcd2a10e975 100644 --- a/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir +++ b/flang/test/Fir/convert-to-llvm-openmp-and-fir.fir @@ -936,9 +936,9 @@ func.func @omp_map_info_descriptor_type_conversion(%arg0 : !fir.ref>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.llvm_ptr> {name = ""} // CHECK: %[[DESC_MAP:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<(ptr, i64, i32, i8, i8, i8, i8)>) map_clauses(always, delete) capture(ByRef) members(%[[MEMBER_MAP]] : [0] : !llvm.ptr) -> !llvm.ptr {name = ""} %2 = omp.map.info var_ptr(%arg0 : !fir.ref>>, !fir.box>) map_clauses(always, delete) capture(ByRef) members(%1 : [0] : !fir.llvm_ptr>) -> !fir.ref>> {name = ""} - // CHECK: omp.target_exit_data map_entries(%[[DESC_MAP]] : !llvm.ptr) + // CHECK: omp.target_exit_data map_entries(%[[DESC_MAP]] : !llvm.ptr) omp.target_exit_data map_entries(%2 : !fir.ref>>) - return + return } // ----- @@ -956,8 +956,8 @@ func.func @omp_map_info_derived_type_explicit_member_conversion(%arg0 : !fir.ref %3 = fir.field_index real, !fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}> %4 = fir.coordinate_of %arg0, %3 : (!fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.field) -> !fir.ref<f32> // CHECK: %[[MAP_MEMBER_2:.*]] = omp.map.info var_ptr(%[[GEP_2]] : !llvm.ptr, f32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "dtype%real"} - %5 = omp.map.info var_ptr(%4 : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "dtype%real"} - // CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<"_QFderived_type", (f32, array<10 x i32>, i32)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [2], [0] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "dtype", partial_map = true} + %5 = omp.map.info var_ptr(%4 : !fir.ref<f32>, f32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<f32> {name = "dtype%real"} + // CHECK: %[[MAP_PARENT:.*]] = omp.map.info var_ptr(%[[ARG_0]] : !llvm.ptr, !llvm.struct<"_QFderived_type", (f32, array<10 x i32>, i32)>) map_clauses(tofrom) capture(ByRef) members(%[[MAP_MEMBER_1]], %[[MAP_MEMBER_2]] : [2], [0] : !llvm.ptr, !llvm.ptr) -> !llvm.ptr {name = "dtype", partial_map = true} %6 = omp.map.info var_ptr(%arg0 : 
!fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>, !fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>) map_clauses(tofrom) capture(ByRef) members(%2, %5 : [2], [0] : !fir.ref<i32>, !fir.ref<f32>) -> !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>> {name = "dtype", partial_map = true} // CHECK: omp.target map_entries(%[[MAP_MEMBER_1]] -> %[[ARG_1:.*]], %[[MAP_MEMBER_2]] -> %[[ARG_2:.*]], %[[MAP_PARENT]] -> %[[ARG_3:.*]] : !llvm.ptr, !llvm.ptr, !llvm.ptr) { omp.target map_entries(%2 -> %arg1, %5 -> %arg2, %6 -> %arg3 : !fir.ref<i32>, !fir.ref<f32>, !fir.ref<!fir.type<_QFderived_type{real:f32,array:!fir.array<10xi32>,int:i32}>>) { @@ -1275,3 +1275,22 @@ func.func @map_nested_dtype_alloca_mem2(%arg0 : !fir.ref { +omp.declare_mapper @my_mapper : !fir.type<_QFdeclare_mapperTmy_type{data:i32}> { +// CHECK: ^bb0(%[[VAL_0:.*]]: !llvm.ptr): +^bb0(%0: !fir.ref<!fir.type<_QFdeclare_mapperTmy_type{data:i32}>>): +// CHECK: %[[VAL_1:.*]] = llvm.mlir.constant(0 : i32) : i32 + %1 = fir.field_index data, !fir.type<_QFdeclare_mapperTmy_type{data:i32}> +// CHECK: %[[VAL_2:.*]] = llvm.getelementptr %[[VAL_0]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)> + %2 = fir.coordinate_of %0, %1 : (!fir.ref<!fir.type<_QFdeclare_mapperTmy_type{data:i32}>>, !fir.field) -> !fir.ref<i32> +// CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%[[VAL_4:.*]]"} + %3 = omp.map.info var_ptr(%2 : !fir.ref<i32>, i32) map_clauses(tofrom) capture(ByRef) -> !fir.ref<i32> {name = "var%data"} +// CHECK: %[[VAL_5:.*]] = omp.map.info var_ptr(%[[VAL_0]] : !llvm.ptr, !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%[[VAL_3]] : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true} + %4 = omp.map.info var_ptr(%0 : !fir.ref<!fir.type<_QFdeclare_mapperTmy_type{data:i32}>>, !fir.type<_QFdeclare_mapperTmy_type{data:i32}>) map_clauses(tofrom) capture(ByRef) members(%3 : [0] : !fir.ref<i32>) -> !fir.ref<!fir.type<_QFdeclare_mapperTmy_type{data:i32}>> {name = "var", partial_map = true} +// CHECK: omp.declare_mapper.info map_entries(%[[VAL_5]], %[[VAL_3]] : !llvm.ptr, !llvm.ptr) + omp.declare_mapper.info map_entries(%4, %3 : !fir.ref<!fir.type<_QFdeclare_mapperTmy_type{data:i32}>>, !fir.ref<i32>) +// CHECK: } +} diff --git a/flang/test/Lower/OpenMP/Todo/map-mapper.f90 b/flang/test/Lower/OpenMP/Todo/map-mapper.f90 deleted file mode 100644 index 9554ffd5fda7b..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/map-mapper.f90 +++ /dev/null @@ -1,16 +0,0 @@ -! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 %s 2>&1 | FileCheck %s -program p - integer, parameter :: n = 256 - real(8) :: a(256) - !! TODO: Add declare mapper, when it works to lower this construct - !!type t1 - !! integer :: x - !!end type t1 - !!!$omp declare mapper(xx : t1 :: nn) map(nn, nn%x) - !$omp target map(mapper(xx), from:a) -!CHECK: not yet implemented: Support for mapper modifiers is not implemented yet - do i=1,n - a(i) = 4.2 - end do - !$omp end target -end program p diff --git a/flang/test/Lower/OpenMP/Todo/omp-declare-mapper.f90 b/flang/test/Lower/OpenMP/Todo/omp-declare-mapper.f90 deleted file mode 100644 index 5ae48ff736048..0000000000000 --- a/flang/test/Lower/OpenMP/Todo/omp-declare-mapper.f90 +++ /dev/null @@ -1,47 +0,0 @@ -! This test checks lowering of OpenMP declare mapper Directive. - -! RUN: split-file %s %t -! RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-1.f90 2>&1 | FileCheck %t/omp-declare-mapper-1.f90 -! 
RUN: not %flang_fc1 -emit-fir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-2.f90 2>&1 | FileCheck %t/omp-declare-mapper-2.f90 - -!--- omp-declare-mapper-1.f90 -subroutine declare_mapper_1 - integer,parameter :: nvals = 250 - type my_type - integer :: num_vals - integer, allocatable :: values(:) - end type - - type my_type2 - type (my_type) :: my_type_var - type (my_type) :: temp - real,dimension(nvals) :: unmapped - real,dimension(nvals) :: arr - end type - type (my_type2) :: t - real :: x, y(nvals) - !$omp declare mapper (my_type :: var) map (var, var%values (1:var%num_vals)) -!CHECK: not yet implemented: OpenMPDeclareMapperConstruct -end subroutine declare_mapper_1 - - -!--- omp-declare-mapper-2.f90 -subroutine declare_mapper_2 - integer,parameter :: nvals = 250 - type my_type - integer :: num_vals - integer, allocatable :: values(:) - end type - - type my_type2 - type (my_type) :: my_type_var - type (my_type) :: temp - real,dimension(nvals) :: unmapped - real,dimension(nvals) :: arr - end type - type (my_type2) :: t - real :: x, y(nvals) - !$omp declare mapper (my_mapper : my_type2 :: v) map (v%arr, x, y(:)) & - !$omp& map (alloc : v%temp) -!CHECK: not yet implemented: OpenMPDeclareMapperConstruct -end subroutine declare_mapper_2 diff --git a/flang/test/Lower/OpenMP/declare-mapper.f90 b/flang/test/Lower/OpenMP/declare-mapper.f90 new file mode 100644 index 0000000000000..fa7f23c182a68 --- /dev/null +++ b/flang/test/Lower/OpenMP/declare-mapper.f90 @@ -0,0 +1,145 @@ +! This test checks lowering of OpenMP declare mapper Directive. + +! RUN: split-file %s %t +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-1.f90 -o - | FileCheck %t/omp-declare-mapper-1.f90 +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-2.f90 -o - | FileCheck %t/omp-declare-mapper-2.f90 +! 
RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %t/omp-declare-mapper-3.f90 -o - | FileCheck %t/omp-declare-mapper-3.f90 + +!--- omp-declare-mapper-1.f90 +subroutine declare_mapper_1 + integer, parameter :: nvals = 250 + type my_type + integer :: num_vals + integer, allocatable :: values(:) + end type + + type my_type2 + type(my_type) :: my_type_var + type(my_type) :: temp + real, dimension(nvals) :: unmapped + real, dimension(nvals) :: arr + end type + type(my_type2) :: t + real :: x, y(nvals) + !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_1my_type\.default]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_1Tmy_type\{num_vals:i32,values:!fir\.box>>\}>]] { + !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_1Evar"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) + !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"values"} {fortran_attrs = #fir.var_attrs} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>> + !CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> + !CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box>>) -> !fir.heap> + !CHECK: %[[VAL_5:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_3]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) + !CHECK: %[[VAL_7:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_9:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_6]]#0 : index + !CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_1]]#0{"num_vals"} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref + !CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref + !CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64 + !CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i64) -> index + !CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_6]]#0 : index + !CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_10]] : index) upper_bound(%[[VAL_15]] : index) extent(%[[VAL_6]]#1 : index) stride(%[[VAL_8]] : index) start_idx(%[[VAL_6]]#0 : index) + !CHECK: %[[VAL_17:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_1]]#0, %[[VAL_17]] : (!fir.ref<[[MY_TYPE]]>, index) -> !fir.ref>>> + !CHECK: %[[VAL_19:.*]] = fir.box_offset %[[VAL_18]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + !CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[VAL_18]] : !fir.ref>>>, i32) var_ptr_ptr(%[[VAL_19]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_16]]) -> !fir.llvm_ptr>> {name = ""} + !CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_18]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {name = "var%[[VAL_22:.*]](1:var%[[VAL_23:.*]])"} + !CHECK: %[[VAL_24:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref<[[MY_TYPE]]>, [[MY_TYPE]]) map_clauses(tofrom) capture(ByRef) members(%[[VAL_21]], %[[VAL_20]] : [1], [1, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref<[[MY_TYPE]]> {name = "var"} + !CHECK: omp.declare_mapper.info map_entries(%[[VAL_24]], %[[VAL_21]], %[[VAL_20]] : !fir.ref<[[MY_TYPE]]>, !fir.ref>>>, !fir.llvm_ptr>>) + !CHECK: } + !$omp declare mapper (my_type :: var) map (var, var%values (1:var%num_vals)) +end subroutine declare_mapper_1 + +!--- omp-declare-mapper-2.f90 +subroutine declare_mapper_2 + integer, parameter :: nvals = 250 + type my_type + integer :: num_vals + integer, allocatable :: values(:) + end type + + type my_type2 + type(my_type) :: my_type_var + 
type(my_type) :: temp + real, dimension(nvals) :: unmapped + real, dimension(nvals) :: arr + end type + type(my_type2) :: t + real :: x, y(nvals) + !CHECK:omp.declare_mapper @[[MY_TYPE_MAPPER:_QQFdeclare_mapper_2my_mapper]] : [[MY_TYPE:!fir\.type<_QFdeclare_mapper_2Tmy_type2\{my_type_var:!fir\.type<_QFdeclare_mapper_2Tmy_type\{num_vals:i32,values:!fir\.box>>\}>,temp:!fir\.type<_QFdeclare_mapper_2Tmy_type\{num_vals:i32,values:!fir\.box>>\}>,unmapped:!fir\.array<250xf32>,arr:!fir\.array<250xf32>\}>]] { + !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_2Ev"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) + !CHECK: %[[VAL_2:.*]] = arith.constant 250 : index + !CHECK: %[[VAL_3:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> + !CHECK: %[[VAL_4:.*]] = hlfir.designate %[[VAL_1]]#0{"arr"} shape %[[VAL_3]] : (!fir.ref<[[MY_TYPE]]>, !fir.shape<1>) -> !fir.ref> + !CHECK: %[[VAL_5:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_6:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_7:.*]] = arith.subi %[[VAL_2]], %[[VAL_5]] : index + !CHECK: %[[VAL_8:.*]] = omp.map.bounds lower_bound(%[[VAL_6]] : index) upper_bound(%[[VAL_7]] : index) extent(%[[VAL_2]] : index) stride(%[[VAL_5]] : index) start_idx(%[[VAL_5]] : index) + !CHECK: %[[VAL_9:.*]] = omp.map.info var_ptr(%[[VAL_4]] : !fir.ref>, !fir.array<250xf32>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_8]]) -> !fir.ref> {name = "v%[[VAL_10:.*]]"} + !CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_1]]#0{"temp"} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>}>> + !CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_11]] : !fir.ref>>}>>, !fir.type<_QFdeclare_mapper_2Tmy_type{num_vals:i32,values:!fir.box>>}>) map_clauses(exit_release_or_enter_alloc) capture(ByRef) -> !fir.ref>>}>> {name = "v%[[VAL_13:.*]]"} + !CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref<[[MY_TYPE]]>, [[MY_TYPE]]) map_clauses(tofrom) capture(ByRef) members(%[[VAL_9]], %[[VAL_12]] : [3], [1] : !fir.ref>, !fir.ref>>}>>) -> !fir.ref<[[MY_TYPE]]> {name = "v", partial_map = true} + !CHECK: omp.declare_mapper.info map_entries(%[[VAL_14]], %[[VAL_9]], %[[VAL_12]] : !fir.ref<[[MY_TYPE]]>, !fir.ref>, !fir.ref>>}>>) + !CHECK: } + !$omp declare mapper (my_mapper : my_type2 :: v) map (v%arr) map (alloc : v%temp) +end subroutine declare_mapper_2 + +!--- omp-declare-mapper-3.f90 +subroutine declare_mapper_3 + type my_type + integer :: num_vals + integer, allocatable :: values(:) + end type + + type my_type2 + type(my_type) :: my_type_var + real, dimension(250) :: arr + end type + + !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER2:_QQFdeclare_mapper_3my_mapper2]] : [[MY_TYPE2:!fir\.type<_QFdeclare_mapper_3Tmy_type2\{my_type_var:!fir\.type<_QFdeclare_mapper_3Tmy_type\{num_vals:i32,values:!fir\.box>>}>,arr:!fir\.array<250xf32>}>]] { + !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE2]]>): + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_3Ev"} : (!fir.ref<[[MY_TYPE2]]>) -> (!fir.ref<[[MY_TYPE2]]>, !fir.ref<[[MY_TYPE2]]>) + !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"my_type_var"} : (!fir.ref<[[MY_TYPE2]]>) -> !fir.ref<[[MY_TYPE:!fir\.type<_QFdeclare_mapper_3Tmy_type\{num_vals:i32,values:!fir\.box>>}>]]> + !CHECK: %[[VAL_3:.*]] = omp.map.info var_ptr(%[[VAL_2]] : !fir.ref<[[MY_TYPE]]>, [[MY_TYPE]]) mapper(@[[MY_TYPE_MAPPER:_QQFdeclare_mapper_3my_mapper]]) map_clauses(tofrom) capture(ByRef) -> !fir.ref<[[MY_TYPE]]> {name = 
"v%[[VAL_4:.*]]"} + !CHECK: %[[VAL_5:.*]] = arith.constant 250 : index + !CHECK: %[[VAL_6:.*]] = fir.shape %[[VAL_5]] : (index) -> !fir.shape<1> + !CHECK: %[[VAL_7:.*]] = hlfir.designate %[[VAL_1]]#0{"arr"} shape %[[VAL_6]] : (!fir.ref<[[MY_TYPE2]]>, !fir.shape<1>) -> !fir.ref> + !CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_9:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_5]], %[[VAL_8]] : index + !CHECK: %[[VAL_11:.*]] = omp.map.bounds lower_bound(%[[VAL_9]] : index) upper_bound(%[[VAL_10]] : index) extent(%[[VAL_5]] : index) stride(%[[VAL_8]] : index) start_idx(%[[VAL_8]] : index) + !CHECK: %[[VAL_12:.*]] = omp.map.info var_ptr(%[[VAL_7]] : !fir.ref>, !fir.array<250xf32>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_11]]) -> !fir.ref> {name = "v%[[VAL_13:.*]]"} + !CHECK: %[[VAL_14:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref<[[MY_TYPE2]]>, [[MY_TYPE2]]) map_clauses(tofrom) capture(ByRef) members(%[[VAL_3]], %[[VAL_12]] : [0], [1] : !fir.ref<[[MY_TYPE]]>, !fir.ref>) -> !fir.ref<[[MY_TYPE2]]> {name = "v", partial_map = true} + !CHECK: omp.declare_mapper.info map_entries(%[[VAL_14]], %[[VAL_3]], %[[VAL_12]] : !fir.ref<[[MY_TYPE2]]>, !fir.ref<[[MY_TYPE]]>, !fir.ref>) + !CHECK: } + + !CHECK: omp.declare_mapper @[[MY_TYPE_MAPPER]] : [[MY_TYPE]] { + !CHECK: ^bb0(%[[VAL_0:.*]]: !fir.ref<[[MY_TYPE]]>): + !CHECK: %[[VAL_1:.*]]:2 = hlfir.declare %[[VAL_0]] {uniq_name = "_QFdeclare_mapper_3Evar"} : (!fir.ref<[[MY_TYPE]]>) -> (!fir.ref<[[MY_TYPE]]>, !fir.ref<[[MY_TYPE]]>) + !CHECK: %[[VAL_2:.*]] = hlfir.designate %[[VAL_1]]#0{"values"} {fortran_attrs = #fir.var_attrs} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref>>> + !CHECK: %[[VAL_3:.*]] = fir.load %[[VAL_2]] : !fir.ref>>> + !CHECK: %[[VAL_4:.*]] = fir.box_addr %[[VAL_3]] : (!fir.box>>) -> !fir.heap> + !CHECK: %[[VAL_5:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_6:.*]]:3 = fir.box_dims %[[VAL_3]], %[[VAL_5]] : (!fir.box>>, index) -> (index, index, index) + !CHECK: %[[VAL_7:.*]] = arith.constant 0 : index + !CHECK: %[[VAL_8:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_9:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_10:.*]] = arith.subi %[[VAL_9]], %[[VAL_6]]#0 : index + !CHECK: %[[VAL_11:.*]] = hlfir.designate %[[VAL_1]]#0{"num_vals"} : (!fir.ref<[[MY_TYPE]]>) -> !fir.ref + !CHECK: %[[VAL_12:.*]] = fir.load %[[VAL_11]] : !fir.ref + !CHECK: %[[VAL_13:.*]] = fir.convert %[[VAL_12]] : (i32) -> i64 + !CHECK: %[[VAL_14:.*]] = fir.convert %[[VAL_13]] : (i64) -> index + !CHECK: %[[VAL_15:.*]] = arith.subi %[[VAL_14]], %[[VAL_6]]#0 : index + !CHECK: %[[VAL_16:.*]] = omp.map.bounds lower_bound(%[[VAL_10]] : index) upper_bound(%[[VAL_15]] : index) extent(%[[VAL_6]]#1 : index) stride(%[[VAL_8]] : index) start_idx(%[[VAL_6]]#0 : index) + !CHECK: %[[VAL_17:.*]] = arith.constant 1 : index + !CHECK: %[[VAL_18:.*]] = fir.coordinate_of %[[VAL_1]]#0, %[[VAL_17]] : (!fir.ref<[[MY_TYPE]]>, index) -> !fir.ref>>> + !CHECK: %[[VAL_19:.*]] = fir.box_offset %[[VAL_18]] base_addr : (!fir.ref>>>) -> !fir.llvm_ptr>> + !CHECK: %[[VAL_20:.*]] = omp.map.info var_ptr(%[[VAL_18]] : !fir.ref>>>, i32) var_ptr_ptr(%[[VAL_19]] : !fir.llvm_ptr>>) map_clauses(tofrom) capture(ByRef) bounds(%[[VAL_16]]) -> !fir.llvm_ptr>> {name = ""} + !CHECK: %[[VAL_21:.*]] = omp.map.info var_ptr(%[[VAL_18]] : !fir.ref>>>, !fir.box>>) map_clauses(to) capture(ByRef) -> !fir.ref>>> {name = "var%[[VAL_22:.*]](1:var%[[VAL_23:.*]])"} + !CHECK: %[[VAL_24:.*]] = omp.map.info var_ptr(%[[VAL_1]]#1 : !fir.ref<[[MY_TYPE]]>, [[MY_TYPE]]) 
map_clauses(tofrom) capture(ByRef) members(%[[VAL_21]], %[[VAL_20]] : [1], [1, 0] : !fir.ref>>>, !fir.llvm_ptr>>) -> !fir.ref<[[MY_TYPE]]> {name = "var"} + !CHECK: omp.declare_mapper.info map_entries(%[[VAL_24]], %[[VAL_21]], %[[VAL_20]] : !fir.ref<[[MY_TYPE]]>, !fir.ref>>>, !fir.llvm_ptr>>) + !CHECK: } + !$omp declare mapper (my_mapper : my_type :: var) map (var, var%values (1:var%num_vals)) + !$omp declare mapper (my_mapper2 : my_type2 :: v) map (mapper(my_mapper) : v%my_type_var) map (tofrom : v%arr) +end subroutine declare_mapper_3 diff --git a/flang/test/Lower/OpenMP/map-mapper.f90 b/flang/test/Lower/OpenMP/map-mapper.f90 new file mode 100644 index 0000000000000..0d8fe7344bfab --- /dev/null +++ b/flang/test/Lower/OpenMP/map-mapper.f90 @@ -0,0 +1,30 @@ +! RUN: %flang_fc1 -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s +program p + integer, parameter :: n = 256 + type t1 + integer :: x(256) + end type t1 + + !$omp declare mapper(xx : t1 :: nn) map(to: nn, nn%x) + !$omp declare mapper(t1 :: nn) map(from: nn) + + !CHECK-LABEL: omp.declare_mapper @_QQFt1.default : !fir.type<_QFTt1{x:!fir.array<256xi32>}> + !CHECK-LABEL: omp.declare_mapper @_QQFxx : !fir.type<_QFTt1{x:!fir.array<256xi32>}> + + type(t1) :: a, b + !CHECK: %[[MAP_A:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) mapper(@_QQFxx) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "a"} + !CHECK: omp.target map_entries(%[[MAP_A]] -> %{{.*}}, %{{.*}} -> %{{.*}} : {{.*}}, {{.*}}) { + !$omp target map(mapper(xx) : a) + do i = 1, n + a%x(i) = i + end do + !$omp end target + + !CHECK: %[[MAP_B:.*]] = omp.map.info var_ptr(%{{.*}} : {{.*}}, {{.*}}) mapper(@_QQFt1.default) map_clauses(tofrom) capture(ByRef) -> {{.*}} {name = "b"} + !CHECK: omp.target map_entries(%[[MAP_B]] -> %{{.*}}, %{{.*}} -> %{{.*}} : {{.*}}, {{.*}}) { + !$omp target map(mapper(default) : b) + do i = 1, n + b%x(i) = i + end do + !$omp end target +end program p diff --git a/libclc/clc/include/clc/clc_convert.h b/libclc/clc/include/clc/clc_convert.h index 20bbd57540b30..12cd988d59c54 100644 --- a/libclc/clc/include/clc/clc_convert.h +++ b/libclc/clc/include/clc/clc_convert.h @@ -1,6 +1,8 @@ #ifndef __CLC_CLC_CONVERT_H__ #define __CLC_CLC_CONVERT_H__ +#include + #define _CLC_CONVERT_DECL(FROM_TYPE, TO_TYPE, SUFFIX) \ _CLC_OVERLOAD _CLC_DECL TO_TYPE __clc_convert_##TO_TYPE##SUFFIX(FROM_TYPE x); diff --git a/libclc/clc/lib/generic/integer/clc_mad_sat.cl b/libclc/clc/lib/generic/integer/clc_mad_sat.cl index 4e559dba2b2f5..530e9c84b10a0 100644 --- a/libclc/clc/lib/generic/integer/clc_mad_sat.cl +++ b/libclc/clc/lib/generic/integer/clc_mad_sat.cl @@ -1,3 +1,4 @@ +#include #include #include #include @@ -8,34 +9,23 @@ #include #include -#define __CLC_CONVERT_TY(X, TY) __builtin_convertvector(X, TY) - -// Macro for defining mad_sat variants for char/uchar/short/ushort -// FIXME: Once using __clc_convert_ty, can easily unify scalar and vector defs #define __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE, UP_TYPE, LIT_PREFIX) \ _CLC_OVERLOAD _CLC_DEF TYPE __clc_mad_sat(TYPE x, TYPE y, TYPE z) { \ - return __clc_clamp( \ - (UP_TYPE)__clc_mad24((UP_TYPE)x, (UP_TYPE)y, (UP_TYPE)z), \ - (UP_TYPE)LIT_PREFIX##_MIN, (UP_TYPE)LIT_PREFIX##_MAX); \ - } - -#define __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE, UP_TYPE, LIT_PREFIX) \ - _CLC_OVERLOAD _CLC_DEF TYPE __clc_mad_sat(TYPE x, TYPE y, TYPE z) { \ - UP_TYPE upscaled_mad = __clc_mad24(__CLC_CONVERT_TY(x, UP_TYPE), \ - __CLC_CONVERT_TY(y, UP_TYPE), \ - __CLC_CONVERT_TY(z, UP_TYPE)); \ + UP_TYPE upscaled_mad = \ + 
__clc_mad24(__clc_convert_##UP_TYPE(x), __clc_convert_##UP_TYPE(y), \ + __clc_convert_##UP_TYPE(z)); \ UP_TYPE clamped_mad = __clc_clamp(upscaled_mad, (UP_TYPE)LIT_PREFIX##_MIN, \ (UP_TYPE)LIT_PREFIX##_MAX); \ - return __CLC_CONVERT_TY(clamped_mad, TYPE); \ + return __clc_convert_##TYPE(clamped_mad); \ } #define __CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(TYPE, UP_TYPE, LIT_PREFIX) \ __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE, UP_TYPE, LIT_PREFIX) \ - __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##2, UP_TYPE##2, LIT_PREFIX) \ - __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##3, UP_TYPE##3, LIT_PREFIX) \ - __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##4, UP_TYPE##4, LIT_PREFIX) \ - __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##8, UP_TYPE##8, LIT_PREFIX) \ - __CLC_DEFINE_SIMPLE_MAD_SAT_VEC(TYPE##16, UP_TYPE##16, LIT_PREFIX) + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE##2, UP_TYPE##2, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE##3, UP_TYPE##3, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE##4, UP_TYPE##4, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE##8, UP_TYPE##8, LIT_PREFIX) \ + __CLC_DEFINE_SIMPLE_MAD_SAT(TYPE##16, UP_TYPE##16, LIT_PREFIX) __CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(char, int, CHAR) __CLC_DEFINE_SIMPLE_MAD_SAT_ALL_TYS(uchar, uint, UCHAR) @@ -67,20 +57,13 @@ __CLC_DEFINE_UINTLONG_MAD_SAT_ALL_TYS(ulong, long, ULONG) INTTY mhi = __clc_mul_hi(x, y); \ UINTTY mlo = __clc_as_##UINTTY(x * y); \ SLONGTY m = __clc_upsample(mhi, mlo); \ - m += __CLC_CONVERT_TY(z, SLONGTY); \ + m += __clc_convert_##SLONGTY(z); \ m = __clc_clamp(m, (SLONGTY)INT_MIN, (SLONGTY)INT_MAX); \ - return __CLC_CONVERT_TY(m, INTTY); \ + return __clc_convert_##INTTY(m); \ } -// FIXME: Once using __clc_convert_ty, can easily unify scalar and vector defs #define __CLC_DEFINE_SINT_MAD_SAT_ALL_TYS(INTTY, UINTTY, SLONGTY) \ - _CLC_OVERLOAD _CLC_DEF INTTY __clc_mad_sat(INTTY x, INTTY y, INTTY z) { \ - INTTY mhi = __clc_mul_hi(x, y); \ - UINTTY mlo = __clc_as_##UINTTY(x * y); \ - SLONGTY m = __clc_upsample(mhi, mlo); \ - m += z; \ - return __clc_clamp(m, (SLONGTY)INT_MIN, (SLONGTY)INT_MAX); \ - } \ + __CLC_DEFINE_SINT_MAD_SAT(INTTY, UINTTY, SLONGTY) \ __CLC_DEFINE_SINT_MAD_SAT(INTTY##2, UINTTY##2, SLONGTY##2) \ __CLC_DEFINE_SINT_MAD_SAT(INTTY##3, UINTTY##3, SLONGTY##3) \ __CLC_DEFINE_SINT_MAD_SAT(INTTY##4, UINTTY##4, SLONGTY##4) \ diff --git a/libclc/clc/lib/generic/integer/clc_mul_hi.cl b/libclc/clc/lib/generic/integer/clc_mul_hi.cl index cf4acc5429cb4..28457ac6126dd 100644 --- a/libclc/clc/lib/generic/integer/clc_mul_hi.cl +++ b/libclc/clc/lib/generic/integer/clc_mul_hi.cl @@ -1,31 +1,24 @@ +#include #include #include #include -// TODO: Replace with __clc_convert_ when available -#define __CLC_CONVERT_TY(X, TY) __builtin_convertvector(X, TY) - -#define __CLC_MUL_HI_VEC_IMPL(BGENTYPE, GENTYPE, GENSIZE) \ - _CLC_OVERLOAD _CLC_DEF GENTYPE __clc_mul_hi(GENTYPE x, GENTYPE y) { \ - BGENTYPE large_x = __CLC_CONVERT_TY(x, BGENTYPE); \ - BGENTYPE large_y = __CLC_CONVERT_TY(y, BGENTYPE); \ - BGENTYPE large_mul_hi = (large_x * large_y) >> (BGENTYPE)GENSIZE; \ - return __CLC_CONVERT_TY(large_mul_hi, GENTYPE); \ - } - // For all types EXCEPT long, which is implemented separately #define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \ _CLC_OVERLOAD _CLC_DEF GENTYPE __clc_mul_hi(GENTYPE x, GENTYPE y) { \ - return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \ + BGENTYPE large_x = __clc_convert_##BGENTYPE(x); \ + BGENTYPE large_y = __clc_convert_##BGENTYPE(y); \ + BGENTYPE large_mul_hi = (large_x * large_y) >> (BGENTYPE)GENSIZE; \ + return __clc_convert_##GENTYPE(large_mul_hi); \ } 
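Both files refactored above rely on the same widen, operate, narrow pattern: perform the arithmetic in a type wide enough that it cannot overflow, then clamp (mad_sat) or take the high half (mul_hi). Spelled out for scalar char in plain C, as a sketch rather than libclc code:

#include <limits.h>
#include <stdio.h>

/* Saturating multiply-add for char, done in int: the small product plus
 * addend cannot overflow the wide type, so clamping afterwards is safe. */
signed char mad_sat_char(signed char x, signed char y, signed char z) {
  int wide = (int)x * (int)y + (int)z;
  if (wide > SCHAR_MAX) wide = SCHAR_MAX;
  if (wide < SCHAR_MIN) wide = SCHAR_MIN;
  return (signed char)wide;
}

/* High half of a widened product, the same shape as __CLC_MUL_HI_IMPL. */
signed char mul_hi_char(signed char x, signed char y) {
  return (signed char)(((int)x * (int)y) >> 8);
}

int main(void) {
  printf("%d\n", mad_sat_char(100, 100, 100)); /* 127: 10100 saturates */
  printf("%d\n", mul_hi_char(100, 100));       /* 39: 10000 >> 8       */
  return 0;
}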
#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \ __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \ - __CLC_MUL_HI_VEC_IMPL(BTYPE##2, TYPE##2, BITS) \ - __CLC_MUL_HI_VEC_IMPL(BTYPE##3, TYPE##3, BITS) \ - __CLC_MUL_HI_VEC_IMPL(BTYPE##4, TYPE##4, BITS) \ - __CLC_MUL_HI_VEC_IMPL(BTYPE##8, TYPE##8, BITS) \ - __CLC_MUL_HI_VEC_IMPL(BTYPE##16, TYPE##16, BITS) + __CLC_MUL_HI_IMPL(BTYPE##2, TYPE##2, BITS) \ + __CLC_MUL_HI_IMPL(BTYPE##3, TYPE##3, BITS) \ + __CLC_MUL_HI_IMPL(BTYPE##4, TYPE##4, BITS) \ + __CLC_MUL_HI_IMPL(BTYPE##8, TYPE##8, BITS) \ + __CLC_MUL_HI_IMPL(BTYPE##16, TYPE##16, BITS) _CLC_OVERLOAD _CLC_DEF long __clc_mul_hi(long x, long y) { long f, o, i; @@ -98,8 +91,8 @@ _CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) { f = x_hi * y_hi; \ o = x_hi * y_lo; \ i = x_lo * y_hi; \ - l = __CLC_CONVERT_TY(x_lo * y_lo, UTY); \ - i += __CLC_CONVERT_TY(l >> (UTY)32, TY); \ + l = __clc_convert_##UTY(x_lo * y_lo); \ + i += __clc_convert_##TY(l >> (UTY)32); \ \ return f + (__clc_hadd(o, i) >> (TY)31); \ } @@ -128,5 +121,3 @@ __CLC_MUL_HI_TYPES() #undef __CLC_MUL_HI_LONG_VEC_IMPL #undef __CLC_MUL_HI_DEC_IMPL #undef __CLC_MUL_HI_IMPL -#undef __CLC_MUL_HI_VEC_IMPL -#undef __CLC_CONVERT_TY diff --git a/libclc/clc/lib/generic/integer/clc_upsample.cl b/libclc/clc/lib/generic/integer/clc_upsample.cl index d53ef7240bfc2..b8f884dc9f63c 100644 --- a/libclc/clc/lib/generic/integer/clc_upsample.cl +++ b/libclc/clc/lib/generic/integer/clc_upsample.cl @@ -1,35 +1,31 @@ +#include #include -// TODO: Replace with __clc_convert_ when available -#define __CLC_CONVERT_TY(X, TY) __builtin_convertvector(X, TY) - -#define __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ +#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ _CLC_OVERLOAD _CLC_DEF BGENTYPE __clc_upsample(GENTYPE hi, UGENTYPE lo) { \ - BGENTYPE large_hi = __CLC_CONVERT_TY(hi, BGENTYPE); \ - BGENTYPE large_lo = __CLC_CONVERT_TY(lo, BGENTYPE); \ + BGENTYPE large_hi = __clc_convert_##BGENTYPE(hi); \ + BGENTYPE large_lo = __clc_convert_##BGENTYPE(lo); \ return (large_hi << (BGENTYPE)GENSIZE) | large_lo; \ } -#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ - _CLC_OVERLOAD _CLC_DEF BGENTYPE __clc_upsample(GENTYPE hi, UGENTYPE lo) { \ - return ((BGENTYPE)hi << GENSIZE) | lo; \ - } \ - __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2, GENSIZE) \ - __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3, GENSIZE) \ - __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4, GENSIZE) \ - __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8, GENSIZE) \ - __CLC_UPSAMPLE_VEC_IMPL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16, GENSIZE) +#define __CLC_UPSAMPLE_IMPL_ALL_TYS(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8, GENSIZE) \ + __CLC_UPSAMPLE_IMPL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16, GENSIZE) #define __CLC_UPSAMPLE_TYPES() \ - __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \ - __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \ - __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \ - __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \ - __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \ - __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) + __CLC_UPSAMPLE_IMPL_ALL_TYS(short, char, uchar, 8) \ + 
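__clc_upsample, unified above into a single macro, concatenates a hi and a lo half into one double-width integer: result = ((wide)hi << bits(lo)) | lo. Scalar C sketch of the smallest case:

#include <stdint.h>

/* char/uchar halves concatenated into a short. */
int16_t upsample_char(int8_t hi, uint8_t lo) {
  return (int16_t)(((int16_t)hi << 8) | lo);
}
/* upsample_char(0x01, 0x02) == 0x0102 */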
__CLC_UPSAMPLE_IMPL_ALL_TYS(ushort, uchar, uchar, 8) \ + __CLC_UPSAMPLE_IMPL_ALL_TYS(int, short, ushort, 16) \ + __CLC_UPSAMPLE_IMPL_ALL_TYS(uint, ushort, ushort, 16) \ + __CLC_UPSAMPLE_IMPL_ALL_TYS(long, int, uint, 32) \ + __CLC_UPSAMPLE_IMPL_ALL_TYS(ulong, uint, uint, 32) __CLC_UPSAMPLE_TYPES() #undef __CLC_UPSAMPLE_TYPES +#undef __CLC_UPSAMPLE_IMPL_ALL_TYS #undef __CLC_UPSAMPLE_IMPL -#undef __CLC_CONVERT_TY diff --git a/libclc/generic/lib/math/clc_exp10.cl b/libclc/generic/lib/math/clc_exp10.cl index 0eb53d013a85a..4f839a9815ac0 100644 --- a/libclc/generic/lib/math/clc_exp10.cl +++ b/libclc/generic/lib/math/clc_exp10.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -70,7 +71,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) { int return_inf = x > X_MAX; int return_zero = x < X_MIN; - int n = convert_int(x * R_64_BY_LOG10_2); + int n = __clc_convert_int(x * R_64_BY_LOG10_2); float fn = (float)n; int j = n & 0x3f; @@ -89,11 +90,11 @@ _CLC_DEF _CLC_OVERLOAD float __clc_exp10(float x) { float two_to_jby64 = USE_TABLE(exp_tbl, j); z2 = __clc_mad(two_to_jby64, z2, two_to_jby64); - float z2s = z2 * as_float(0x1 << (m + 149)); - float z2n = as_float(as_int(z2) + m2); + float z2s = z2 * __clc_as_float(0x1 << (m + 149)); + float z2n = __clc_as_float(__clc_as_int(z2) + m2); z2 = m <= -126 ? z2s : z2n; - z2 = return_inf ? as_float(PINFBITPATT_SP32) : z2; + z2 = return_inf ? __clc_as_float(PINFBITPATT_SP32) : z2; z2 = return_zero ? 0.0f : z2; z2 = return_nan ? x : z2; return z2; @@ -115,7 +116,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) { // ln(10) const double R_LN10 = 0x1.26bb1bbb55516p+1; - int n = convert_int(x * R_64_BY_LOG10_2); + int n = __clc_convert_int(x * R_64_BY_LOG10_2); double dn = (double)n; @@ -144,15 +145,15 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x) { int n1 = m >> 2; int n2 = m - n1; - double z3 = z2 * as_double(((long)n1 + 1023) << 52); - z3 *= as_double(((long)n2 + 1023) << 52); + double z3 = z2 * __clc_as_double(((long)n1 + 1023) << 52); + z3 *= __clc_as_double(((long)n2 + 1023) << 52); z2 = ldexp(z2, m); z2 = small_value ? z3 : z2; z2 = __clc_isnan(x) ? x : z2; - z2 = x > X_MAX ? as_double(PINFBITPATT_DP64) : z2; + z2 = x > X_MAX ? __clc_as_double(PINFBITPATT_DP64) : z2; z2 = x < X_MIN ? 0.0 : z2; return z2; diff --git a/libclc/generic/lib/math/clc_fmod.cl b/libclc/generic/lib/math/clc_fmod.cl index a4a2ab791df68..31a5d4dc05c03 100644 --- a/libclc/generic/lib/math/clc_fmod.cl +++ b/libclc/generic/lib/math/clc_fmod.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -31,19 +32,19 @@ #include _CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { - int ux = as_int(x); + int ux = __clc_as_int(x); int ax = ux & EXSIGNBIT_SP32; - float xa = as_float(ax); + float xa = __clc_as_float(ax); int sx = ux ^ ax; int ex = ax >> EXPSHIFTBITS_SP32; - int uy = as_int(y); + int uy = __clc_as_int(y); int ay = uy & EXSIGNBIT_SP32; - float ya = as_float(ay); + float ya = __clc_as_float(ay); int ey = ay >> EXPSHIFTBITS_SP32; - float xr = as_float(0x3f800000 | (ax & 0x007fffff)); - float yr = as_float(0x3f800000 | (ay & 0x007fffff)); + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); int c; int k = ex - ey; @@ -62,17 +63,17 @@ _CLC_DEF _CLC_OVERLOAD float __clc_fmod(float x, float y) { xr = lt ? xa : xr; yr = lt ? ya : yr; - float s = as_float(ey << EXPSHIFTBITS_SP32); + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); xr *= lt ? 
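The range reduction in the clc_exp10 hunk above writes 10^x as 2^(n/64) * 10^r for n = (int)(x * 64/log10(2)) and a small remainder r; n/64 then splits into an integer power m = n >> 6 and a 64-entry table index j = n & 0x3f. Worked numbers for x = 1, as a hedged sketch of the bookkeeping only:

#include <stdio.h>

int main(void) {
  const float R_64_BY_LOG10_2 = 212.6033980727912f; /* 64 / log10(2) */
  float x = 1.0f;
  int n = (int)(x * R_64_BY_LOG10_2); /* 212: x in units of log10(2)/64   */
  int j = n & 0x3f;                   /* 20: index into the 2^(j/64) table */
  int m = n >> 6;                     /* 3:  integer power of two          */
  /* 2^(212/64) = 2^3 * 2^(20/64) ~= 8 * 1.2418 ~= 9.93, and 10^1 = 10, so
   * the residual 10^r is close to 1, where the polynomial is accurate.   */
  printf("n=%d m=%d j=%d\n", n, m, j);
  return 0;
}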
1.0f : s; c = ax == ay; xr = c ? 0.0f : xr; - xr = as_float(sx ^ as_int(xr)); + xr = __clc_as_float(sx ^ __clc_as_int(xr)); c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; - xr = c ? as_float(QNANBITPATT_SP32) : xr; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; return xr; } @@ -80,18 +81,18 @@ _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_fmod, float, float); #ifdef cl_khr_fp64 _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { - ulong ux = as_ulong(x); + ulong ux = __clc_as_ulong(x); ulong ax = ux & ~SIGNBIT_DP64; ulong xsgn = ux ^ ax; - double dx = as_double(ax); - int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); xexp1 = xexp < 1 ? xexp1 : xexp; - ulong uy = as_ulong(y); + ulong uy = __clc_as_ulong(y); ulong ay = uy & ~SIGNBIT_DP64; - double dy = as_double(ay); - int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); yexp1 = yexp < 1 ? yexp1 : yexp; @@ -151,12 +152,12 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { dx += i ? w : 0.0; // At this point, dx lies in the range [0,dy) - double ret = as_double(xsgn ^ as_ulong(dx)); - dx = as_double(ax); + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); // Now handle |x| == |y| int c = dx == dy; - t = as_double(xsgn); + t = __clc_as_double(xsgn); ret = c ? t : ret; // Next, handle |x| < |y| @@ -167,7 +168,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { // |y| is 0 c = dy == 0.0; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; // y is +-Inf, NaN c = yexp > BIASEDEMAX_DP64; @@ -176,7 +177,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) { // x is +=Inf, NaN c = xexp > BIASEDEMAX_DP64; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; return ret; } diff --git a/libclc/generic/lib/math/clc_pow.cl b/libclc/generic/lib/math/clc_pow.cl index 5dcd392c0f7ed..fce9573c39bac 100644 --- a/libclc/generic/lib/math/clc_pow.cl +++ b/libclc/generic/lib/math/clc_pow.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -68,18 +69,18 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { - int ix = as_int(x); + int ix = __clc_as_int(x); int ax = ix & EXSIGNBIT_SP32; int xpos = ix == ax; - int iy = as_int(y); + int iy = __clc_as_int(y); int ay = iy & EXSIGNBIT_SP32; int ypos = iy == ay; /* Extra precise log calculation * First handle case that x is close to 1 */ - float r = 1.0f - as_float(ax); + float r = 1.0f - __clc_as_float(ax); int near1 = __clc_fabs(r) < 0x1.0p-4f; float r2 = r * r; @@ -103,7 +104,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { /* Computations for x not near 1 */ int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + int ixs = __clc_as_int(__clc_as_float(ax | 0x3f800000) - 1.0f); float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); int c = m == -127; int ixn = c ? 
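All the as_* / __clc_as_* calls being renamed throughout these hunks are bit reinterpretations, not value conversions (convert_* / __clc_convert_*, by contrast, convert values). A portable C sketch of what the builtin does:

#include <stdint.h>
#include <string.h>

/* Reinterpret the bits of a float as an int and back (the OpenCL builtin
 * compiles away entirely; memcpy is the portable C spelling). */
int32_t as_int_bits(float f)     { int32_t i; memcpy(&i, &f, sizeof i); return i; }
float   as_float_bits(int32_t i) { float f;   memcpy(&f, &i, sizeof f); return f; }

/* e.g. as_int_bits(1.0f) == 0x3f800000, and as_float_bits(0x7f800000)
 * (PINFBITPATT_SP32 above) is +infinity. */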
ixs : ax; @@ -112,8 +113,8 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); /* F - Y */ - float f = as_float(0x3f000000 | indx) - - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + float f = __clc_as_float(0x3f000000 | indx) - + __clc_as_float(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; float2 tv = USE_TABLE(log_inv_tbl_ep, indx); @@ -141,10 +142,10 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { lh = near1 ? lh_near1 : lh; l = near1 ? l_near1 : l; - float gh = as_float(as_int(l) & 0xfffff000); + float gh = __clc_as_float(__clc_as_int(l) & 0xfffff000); float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - float yh = as_float(iy & 0xfffff000); + float yh = __clc_as_float(iy & 0xfffff000); float yt = y - yh; @@ -155,7 +156,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { /* Extra precise exp of ylogx */ /* 64/log2 : 92.332482616893657 */ const float R_64_BY_LOG2 = 0x1.715476p+6f; - int n = convert_int(ylogx * R_64_BY_LOG2); + int n = __clc_convert_int(ylogx * R_64_BY_LOG2); float nf = (float)n; int j = n & 0x3f; @@ -178,14 +179,14 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { float expylogx = __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); + float sexpylogx = expylogx * __clc_as_float(0x1 << (m + 149)); + float texpylogx = __clc_as_float(__clc_as_int(expylogx) + m2); expylogx = m < -125 ? sexpylogx : texpylogx; /* Result is +-Inf if (ylogx + ylogx_t) > 128*log2 */ expylogx = (ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f) - ? as_float(PINFBITPATT_SP32) + ? __clc_as_float(PINFBITPATT_SP32) : expylogx; /* Result is 0 if ylogx < -149*log2 */ @@ -205,9 +206,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { inty = yexp < 1 ? 0 : inty; inty = yexp > 24 ? 2 : inty; - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + float signval = __clc_as_float((__clc_as_uint(expylogx) ^ SIGNBIT_SP32)); expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); + int ret = __clc_as_int(expylogx); /* Corner case handling */ ret = (!xpos & (inty == 0)) ? QNANBITPATT_SP32 : ret; @@ -236,7 +237,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pow(float x, float y) { ret = ay == 0 ? 0x3f800000 : ret; ret = ix == 0x3f800000 ? 
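The gh/yh computations above implement a head/tail split: masking the low 12 mantissa bits off a float leaves a "head" whose products with other heads round exactly, so the discarded part can be carried separately as a "tail". Hedged C sketch of the split:

#include <stdint.h>
#include <string.h>

/* Keep sign, exponent and the top 11 mantissa bits; zero the low 12. */
float head_of(float y) {
  uint32_t u;
  memcpy(&u, &y, sizeof u);
  u &= 0xfffff000u;
  memcpy(&y, &u, sizeof u);
  return y;
}
/* Then y == head + tail exactly with tail = y - head_of(y), and a product of
 * two 12-bit heads has at most 24 significant bits, so it fits in a float
 * without rounding error. */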
0x3f800000 : ret; - return as_float(ret); + return __clc_as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pow, float, float) @@ -245,11 +246,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { const double real_log2_tail = 5.76999904754328540596e-08; const double real_log2_lead = 6.93147122859954833984e-01; - long ux = as_long(x); + long ux = __clc_as_long(x); long ax = ux & (~SIGNBIT_DP64); int xpos = ax == ux; - long uy = as_long(y); + long uy = __clc_as_long(y); long ay = uy & (~SIGNBIT_DP64); int ypos = ay == uy; @@ -261,7 +262,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { double xexp = (double)exp; long mantissa = ax & 0x000FFFFFFFFFFFFFL; - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + long temp_ux = + __clc_as_long(__clc_as_double(0x3ff0000000000000L | mantissa) - 1.0); exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; double xexp1 = (double)exp; long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; @@ -273,14 +275,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { ((mantissa & 0x0000080000000000) << 1); int index = rax >> 44; - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); + double F = __clc_as_double(rax | 0x3FE0000000000000L); + double Y = __clc_as_double(mantissa | 0x3FE0000000000000L); double f = F - Y; double2 tv = USE_TABLE(log_f_inv_tbl, index); double log_h = tv.s0; double log_t = tv.s1; double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L); double r2 = fma(-F, r1, f) * (log_h + log_t); double r = r1 + r2; @@ -304,11 +306,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { double resT_h = poly0h; double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double H_h = __clc_as_double(__clc_as_long(H) & 0xfffffffff8000000L); double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); H = H_h; - double y_head = as_double(uy & 0xfffffffff8000000L); + double y_head = __clc_as_double(uy & 0xfffffffff8000000L); double y_tail = y - y_head; double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); @@ -354,7 +356,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { expv = fma(f, q, f2) + f1; expv = ldexp(expv, m); - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv; expv = v < min_exp_arg ? 0.0 : expv; } @@ -376,7 +378,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { expv *= (inty == 1) & !xpos ? -1.0 : 1.0; - long ret = as_long(expv); + long ret = __clc_as_long(expv); // Now all the edge cases ret = !xpos & (inty == 0) ? QNANBITPATT_DP64 : ret; @@ -410,7 +412,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pow(double x, double y) { ret = ay == 0L ? 0x3ff0000000000000L : ret; ret = ux == 0x3ff0000000000000L ? 
0x3ff0000000000000L : ret; - return as_double(ret); + return __clc_as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pow, double, double) #endif diff --git a/libclc/generic/lib/math/clc_pown.cl b/libclc/generic/lib/math/clc_pown.cl index a0f968c238e99..a613b2998c3f6 100644 --- a/libclc/generic/lib/math/clc_pown.cl +++ b/libclc/generic/lib/math/clc_pown.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -67,17 +68,17 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { float y = (float)ny; - int ix = as_int(x); + int ix = __clc_as_int(x); int ax = ix & EXSIGNBIT_SP32; int xpos = ix == ax; - int iy = as_int(y); + int iy = __clc_as_int(y); int ay = iy & EXSIGNBIT_SP32; int ypos = iy == ay; // Extra precise log calculation // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); + float r = 1.0f - __clc_as_float(ax); int near1 = __clc_fabs(r) < 0x1.0p-4f; float r2 = r * r; @@ -101,7 +102,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { // Computations for x not near 1 int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + int ixs = __clc_as_int(__clc_as_float(ax | 0x3f800000) - 1.0f); float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); int c = m == -127; int ixn = c ? ixs : ax; @@ -110,8 +111,8 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); // F - Y - float f = as_float(0x3f000000 | indx) - - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + float f = __clc_as_float(0x3f000000 | indx) - + __clc_as_float(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; float2 tv = USE_TABLE(log_inv_tbl_ep, indx); @@ -139,10 +140,10 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { lh = near1 ? lh_near1 : lh; l = near1 ? l_near1 : l; - float gh = as_float(as_int(l) & 0xfffff000); + float gh = __clc_as_float(__clc_as_int(l) & 0xfffff000); float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - float yh = as_float(iy & 0xfffff000); + float yh = __clc_as_float(iy & 0xfffff000); float yt = (float)(ny - (int)yh); @@ -153,7 +154,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { // Extra precise exp of ylogx // 64/log2 : 92.332482616893657 const float R_64_BY_LOG2 = 0x1.715476p+6f; - int n = convert_int(ylogx * R_64_BY_LOG2); + int n = __clc_convert_int(ylogx * R_64_BY_LOG2); float nf = (float)n; int j = n & 0x3f; @@ -176,14 +177,14 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { float expylogx = __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); + float sexpylogx = expylogx * __clc_as_float(0x1 << (m + 149)); + float texpylogx = __clc_as_float(__clc_as_int(expylogx) + m2); expylogx = m < -125 ? sexpylogx : texpylogx; // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) - ? as_float(PINFBITPATT_SP32) + ? __clc_as_float(PINFBITPATT_SP32) : expylogx; // Result is 0 if ylogx < -149*log2 @@ -196,9 +197,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { int inty = 2 - (ny & 1); - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + float signval = __clc_as_float((__clc_as_uint(expylogx) ^ SIGNBIT_SP32)); expylogx = ((inty == 1) & !xpos) ? 
signval : expylogx; - int ret = as_int(expylogx); + int ret = __clc_as_int(expylogx); // Corner case handling int xinf = xpos ? PINFBITPATT_SP32 : NINFBITPATT_SP32; @@ -218,7 +219,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_pown(float x, int ny) { ret = ax > PINFBITPATT_SP32 ? ix : ret; ret = ny == 0 ? 0x3f800000 : ret; - return as_float(ret); + return __clc_as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_pown, float, int) @@ -229,11 +230,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { double y = (double)ny; - long ux = as_long(x); + long ux = __clc_as_long(x); long ax = ux & (~SIGNBIT_DP64); int xpos = ax == ux; - long uy = as_long(y); + long uy = __clc_as_long(y); long ay = uy & (~SIGNBIT_DP64); int ypos = ay == uy; @@ -245,7 +246,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { double xexp = (double)exp; long mantissa = ax & 0x000FFFFFFFFFFFFFL; - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + long temp_ux = + __clc_as_long(__clc_as_double(0x3ff0000000000000L | mantissa) - 1.0); exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; double xexp1 = (double)exp; long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; @@ -257,14 +259,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { ((mantissa & 0x0000080000000000) << 1); int index = rax >> 44; - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); + double F = __clc_as_double(rax | 0x3FE0000000000000L); + double Y = __clc_as_double(mantissa | 0x3FE0000000000000L); double f = F - Y; double2 tv = USE_TABLE(log_f_inv_tbl, index); double log_h = tv.s0; double log_t = tv.s1; double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L); double r2 = fma(-F, r1, f) * (log_h + log_t); double r = r1 + r2; @@ -288,15 +290,15 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { double resT_h = poly0h; double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double H_h = __clc_as_double(__clc_as_long(H) & 0xfffffffff8000000L); double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); H = H_h; - double y_head = as_double(uy & 0xfffffffff8000000L); + double y_head = __clc_as_double(uy & 0xfffffffff8000000L); double y_tail = y - y_head; int mask_2_24 = ay > 0x4170000000000000; // 2^24 - int nyh = convert_int(y_head); + int nyh = __clc_convert_int(y_head); int nyt = ny - nyh; double y_tail1 = (double)nyt; y_tail = mask_2_24 ? y_tail1 : y_tail; @@ -344,7 +346,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { expv = fma(f, q, f2) + f1; expv = ldexp(expv, m); - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv; expv = v < min_exp_arg ? 0.0 : expv; } @@ -357,7 +359,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - long ret = as_long(expv); + long ret = __clc_as_long(expv); // Now all the edge cases long xinf = xpos ? PINFBITPATT_DP64 : NINFBITPATT_DP64; @@ -378,7 +380,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_pown(double x, int ny) { ret = ax > PINFBITPATT_DP64 ? ux : ret; ret = ny == 0 ? 
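In the pown hunks, inty = 2 - (ny & 1) encodes exponent parity (1 for odd, 2 for even), and the parity's only job is to decide the result's sign for negative bases. Reference behaviour as a hedged C sketch (pown_ref is an invented name):

#include <math.h>

double pown_ref(double x, int n) {
  double mag = pow(fabs(x), (double)n);
  /* A negative base flips the sign only when the integer exponent is odd. */
  return (x < 0.0 && (n & 1)) ? -mag : mag;
}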
0x3ff0000000000000L : ret; - return as_double(ret); + return __clc_as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int) #endif @@ -388,7 +390,7 @@ _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_pown, double, int) #pragma OPENCL EXTENSION cl_khr_fp16 : enable _CLC_OVERLOAD _CLC_DEF half __clc_pown(half x, int y) { - return (half)__clc_pown((float)x, y); + return (half)__clc_pown((float)x, y); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_pown, half, int); diff --git a/libclc/generic/lib/math/clc_powr.cl b/libclc/generic/lib/math/clc_powr.cl index 7e1a6f2a02e7a..7876acaee89a6 100644 --- a/libclc/generic/lib/math/clc_powr.cl +++ b/libclc/generic/lib/math/clc_powr.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -65,17 +66,17 @@ // ((((expT * poly) + expT) + expH*poly) + expH) _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { - int ix = as_int(x); + int ix = __clc_as_int(x); int ax = ix & EXSIGNBIT_SP32; int xpos = ix == ax; - int iy = as_int(y); + int iy = __clc_as_int(y); int ay = iy & EXSIGNBIT_SP32; int ypos = iy == ay; // Extra precise log calculation // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); + float r = 1.0f - __clc_as_float(ax); int near1 = __clc_fabs(r) < 0x1.0p-4f; float r2 = r * r; @@ -99,7 +100,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { // Computations for x not near 1 int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + int ixs = __clc_as_int(__clc_as_float(ax | 0x3f800000) - 1.0f); float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); int c = m == -127; int ixn = c ? ixs : ax; @@ -108,8 +109,8 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); // F - Y - float f = as_float(0x3f000000 | indx) - - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + float f = __clc_as_float(0x3f000000 | indx) - + __clc_as_float(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; float2 tv = USE_TABLE(log_inv_tbl_ep, indx); @@ -137,10 +138,10 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { lh = near1 ? lh_near1 : lh; l = near1 ? l_near1 : l; - float gh = as_float(as_int(l) & 0xfffff000); + float gh = __clc_as_float(__clc_as_int(l) & 0xfffff000); float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - float yh = as_float(iy & 0xfffff000); + float yh = __clc_as_float(iy & 0xfffff000); float yt = y - yh; @@ -151,7 +152,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { // Extra precise exp of ylogx // 64/log2 : 92.332482616893657 const float R_64_BY_LOG2 = 0x1.715476p+6f; - int n = convert_int(ylogx * R_64_BY_LOG2); + int n = __clc_convert_int(ylogx * R_64_BY_LOG2); float nf = (float)n; int j = n & 0x3f; @@ -173,14 +174,14 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { float expylogx = __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; - float sexpylogx = expylogx * as_float(0x1 << (m + 149)); - float texpylogx = as_float(as_int(expylogx) + m2); + float sexpylogx = expylogx * __clc_as_float(0x1 << (m + 149)); + float texpylogx = __clc_as_float(__clc_as_int(expylogx) + m2); expylogx = m < -125 ? sexpylogx : texpylogx; // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) - ? as_float(PINFBITPATT_SP32) + ? 
__clc_as_float(PINFBITPATT_SP32) : expylogx; // Result is 0 if ylogx < -149*log2 @@ -199,9 +200,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { inty = yexp < 1 ? 0 : inty; inty = yexp > 24 ? 2 : inty; - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + float signval = __clc_as_float((__clc_as_uint(expylogx) ^ SIGNBIT_SP32)); expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); + int ret = __clc_as_int(expylogx); // Corner case handling ret = ax < 0x3f800000 & iy == NINFBITPATT_SP32 ? PINFBITPATT_SP32 : ret; @@ -223,7 +224,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_powr(float x, float y) { ret = ax > PINFBITPATT_SP32 ? ix : ret; ret = ay > PINFBITPATT_SP32 ? iy : ret; - return as_float(ret); + return __clc_as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_powr, float, float) @@ -232,11 +233,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { const double real_log2_tail = 5.76999904754328540596e-08; const double real_log2_lead = 6.93147122859954833984e-01; - long ux = as_long(x); + long ux = __clc_as_long(x); long ax = ux & (~SIGNBIT_DP64); int xpos = ax == ux; - long uy = as_long(y); + long uy = __clc_as_long(y); long ay = uy & (~SIGNBIT_DP64); int ypos = ay == uy; @@ -248,7 +249,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { double xexp = (double)exp; long mantissa = ax & 0x000FFFFFFFFFFFFFL; - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + long temp_ux = + __clc_as_long(__clc_as_double(0x3ff0000000000000L | mantissa) - 1.0); exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; double xexp1 = (double)exp; long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; @@ -260,14 +262,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { ((mantissa & 0x0000080000000000) << 1); int index = rax >> 44; - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); + double F = __clc_as_double(rax | 0x3FE0000000000000L); + double Y = __clc_as_double(mantissa | 0x3FE0000000000000L); double f = F - Y; double2 tv = USE_TABLE(log_f_inv_tbl, index); double log_h = tv.s0; double log_t = tv.s1; double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L); double r2 = fma(-F, r1, f) * (log_h + log_t); double r = r1 + r2; @@ -291,11 +293,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { double resT_h = poly0h; double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double H_h = __clc_as_double(__clc_as_long(H) & 0xfffffffff8000000L); double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); H = H_h; - double y_head = as_double(uy & 0xfffffffff8000000L); + double y_head = __clc_as_double(uy & 0xfffffffff8000000L); double y_tail = y - y_head; double temp = fma(y_tail, H, fma(y_head, T, y_tail * T)); @@ -341,7 +343,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { expv = fma(f, q, f2) + f1; expv = ldexp(expv, m); - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv; expv = v < min_exp_arg ? 0.0 : expv; } @@ -363,7 +365,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { expv *= ((inty == 1) & !xpos) ? 
-1.0 : 1.0; - long ret = as_long(expv); + long ret = __clc_as_long(expv); // Now all the edge cases ret = ax < 0x3ff0000000000000L & uy == NINFBITPATT_DP64 ? PINFBITPATT_DP64 @@ -389,7 +391,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_powr(double x, double y) { ret = ax > PINFBITPATT_DP64 ? ux : ret; ret = ay > PINFBITPATT_DP64 ? uy : ret; - return as_double(ret); + return __clc_as_double(ret); } -_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_powr, double, double) +_CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_powr, double, + double) #endif diff --git a/libclc/generic/lib/math/clc_remainder.cl b/libclc/generic/lib/math/clc_remainder.cl index 31d17d5aaf6b6..6302b9776782f 100644 --- a/libclc/generic/lib/math/clc_remainder.cl +++ b/libclc/generic/lib/math/clc_remainder.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -31,19 +32,19 @@ #include _CLC_DEF _CLC_OVERLOAD float __clc_remainder(float x, float y) { - int ux = as_int(x); + int ux = __clc_as_int(x); int ax = ux & EXSIGNBIT_SP32; - float xa = as_float(ax); + float xa = __clc_as_float(ax); int sx = ux ^ ax; int ex = ax >> EXPSHIFTBITS_SP32; - int uy = as_int(y); + int uy = __clc_as_int(y); int ay = uy & EXSIGNBIT_SP32; - float ya = as_float(ay); + float ya = __clc_as_float(ay); int ey = ay >> EXPSHIFTBITS_SP32; - float xr = as_float(0x3f800000 | (ax & 0x007fffff)); - float yr = as_float(0x3f800000 | (ay & 0x007fffff)); + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); int c; int k = ex - ey; @@ -71,17 +72,17 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remainder(float x, float y) { xr -= c ? yr : 0.0f; q += c; - float s = as_float(ey << EXPSHIFTBITS_SP32); + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); xr *= lt ? 1.0f : s; c = ax == ay; xr = c ? 0.0f : xr; - xr = as_float(sx ^ as_int(xr)); + xr = __clc_as_float(sx ^ __clc_as_int(xr)); c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; - xr = c ? as_float(QNANBITPATT_SP32) : xr; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; return xr; } @@ -90,18 +91,18 @@ _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_remainder, float, #ifdef cl_khr_fp64 _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { - ulong ux = as_ulong(x); + ulong ux = __clc_as_ulong(x); ulong ax = ux & ~SIGNBIT_DP64; ulong xsgn = ux ^ ax; - double dx = as_double(ax); - int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); xexp1 = xexp < 1 ? xexp1 : xexp; - ulong uy = as_ulong(y); + ulong uy = __clc_as_ulong(y); ulong ay = uy & ~SIGNBIT_DP64; - double dy = as_double(ay); - int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); yexp1 = yexp < 1 ? yexp1 : yexp; @@ -181,12 +182,12 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { dx = dy < 0x1.0p+1022 ? dxl : dxg; - double ret = as_double(xsgn ^ as_ulong(dx)); - dx = as_double(ax); + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); // Now handle |x| == |y| int c = dx == dy; - t = as_double(xsgn); + t = __clc_as_double(xsgn); ret = c ? 
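clc_fmod and clc_remainder share almost all of their bit manipulation; the contracts differ only in how the implicit quotient is rounded. A small C demonstration of the difference:

#include <math.h>
#include <stdio.h>

int main(void) {
  /* fmod truncates the quotient toward zero; remainder rounds to nearest. */
  printf("%g\n", fmod(7.5, 2.0));      /* 1.5  (7.5 - 3*2) */
  printf("%g\n", remainder(7.5, 2.0)); /* -0.5 (7.5 - 4*2) */
  return 0;
}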
t : ret; // Next, handle |x| < |y| @@ -203,7 +204,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { // |y| is 0 c = dy == 0.0; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; // y is +-Inf, NaN c = yexp > BIASEDEMAX_DP64; @@ -212,7 +213,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) { // x is +=Inf, NaN c = xexp > BIASEDEMAX_DP64; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; return ret; } diff --git a/libclc/generic/lib/math/clc_remquo.cl b/libclc/generic/lib/math/clc_remquo.cl index af3e7a2b07500..699517e180708 100644 --- a/libclc/generic/lib/math/clc_remquo.cl +++ b/libclc/generic/lib/math/clc_remquo.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -34,20 +35,20 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, __private int *quo) { x = __clc_flush_denormal_if_not_supported(x); y = __clc_flush_denormal_if_not_supported(y); - int ux = as_int(x); + int ux = __clc_as_int(x); int ax = ux & EXSIGNBIT_SP32; - float xa = as_float(ax); + float xa = __clc_as_float(ax); int sx = ux ^ ax; int ex = ax >> EXPSHIFTBITS_SP32; - int uy = as_int(y); + int uy = __clc_as_int(y); int ay = uy & EXSIGNBIT_SP32; - float ya = as_float(ay); + float ya = __clc_as_float(ay); int sy = uy ^ ay; int ey = ay >> EXPSHIFTBITS_SP32; - float xr = as_float(0x3f800000 | (ax & 0x007fffff)); - float yr = as_float(0x3f800000 | (ay & 0x007fffff)); + float xr = __clc_as_float(0x3f800000 | (ax & 0x007fffff)); + float yr = __clc_as_float(0x3f800000 | (ay & 0x007fffff)); int c; int k = ex - ey; @@ -75,7 +76,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, xr -= c ? yr : 0.0f; q += c; - float s = as_float(ey << EXPSHIFTBITS_SP32); + float s = __clc_as_float(ey << EXPSHIFTBITS_SP32); xr *= lt ? 1.0f : s; int qsgn = sx == sy ? 1 : -1; @@ -85,12 +86,12 @@ _CLC_DEF _CLC_OVERLOAD float __clc_remquo(float x, float y, quot = c ? qsgn : quot; xr = c ? 0.0f : xr; - xr = as_float(sx ^ as_int(xr)); + xr = __clc_as_float(sx ^ __clc_as_int(xr)); c = ax > PINFBITPATT_SP32 | ay > PINFBITPATT_SP32 | ax == PINFBITPATT_SP32 | ay == 0; quot = c ? 0 : quot; - xr = c ? as_float(QNANBITPATT_SP32) : xr; + xr = c ? __clc_as_float(QNANBITPATT_SP32) : xr; *quo = quot; @@ -130,18 +131,18 @@ __VEC_REMQUO(float, 16, 8) #ifdef cl_khr_fp64 _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, __private int *pquo) { - ulong ux = as_ulong(x); + ulong ux = __clc_as_ulong(x); ulong ax = ux & ~SIGNBIT_DP64; ulong xsgn = ux ^ ax; - double dx = as_double(ax); - int xexp = convert_int(ax >> EXPSHIFTBITS_DP64); + double dx = __clc_as_double(ax); + int xexp = __clc_convert_int(ax >> EXPSHIFTBITS_DP64); int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64); xexp1 = xexp < 1 ? xexp1 : xexp; - ulong uy = as_ulong(y); + ulong uy = __clc_as_ulong(y); ulong ay = uy & ~SIGNBIT_DP64; - double dy = as_double(ay); - int yexp = convert_int(ay >> EXPSHIFTBITS_DP64); + double dy = __clc_as_double(ay); + int yexp = __clc_convert_int(ay >> EXPSHIFTBITS_DP64); int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64); yexp1 = yexp < 1 ? yexp1 : yexp; @@ -223,12 +224,12 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, lt += dy < 0x1.0p+1022 ? 
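remquo additionally reports the quotient: the code above keeps its sign and low seven bits ((int)lt & 0x7f), comfortably meeting the C requirement of at least three low-order bits. Usage sketch:

#include <math.h>
#include <stdio.h>

int main(void) {
  int q;
  double r = remquo(7.5, 2.0, &q); /* quotient 3.75 rounds to 4 */
  printf("r=%g quo=%d\n", r, q);   /* prints r=-0.5 quo=4       */
  return 0;
}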
al : ag; int quo = ((int)lt & 0x7f) * qsgn; - double ret = as_double(xsgn ^ as_ulong(dx)); - dx = as_double(ax); + double ret = __clc_as_double(xsgn ^ __clc_as_ulong(dx)); + dx = __clc_as_double(ax); // Now handle |x| == |y| int c = dx == dy; - t = as_double(xsgn); + t = __clc_as_double(xsgn); quo = c ? qsgn : quo; ret = c ? t : ret; @@ -249,7 +250,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, // |y| is 0 c = dy == 0.0; quo = c ? 0 : quo; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; // y is +-Inf, NaN c = yexp > BIASEDEMAX_DP64; @@ -260,7 +261,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y, // x is +=Inf, NaN c = xexp > BIASEDEMAX_DP64; quo = c ? 0 : quo; - ret = c ? as_double(QNANBITPATT_DP64) : ret; + ret = c ? __clc_as_double(QNANBITPATT_DP64) : ret; *pquo = quo; return ret; diff --git a/libclc/generic/lib/math/clc_rootn.cl b/libclc/generic/lib/math/clc_rootn.cl index 42b983784c14d..dabaa2a4f3f2a 100644 --- a/libclc/generic/lib/math/clc_rootn.cl +++ b/libclc/generic/lib/math/clc_rootn.cl @@ -21,6 +21,7 @@ */ #include +#include #include #include #include @@ -67,17 +68,17 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { float y = MATH_RECIP((float)ny); - int ix = as_int(x); + int ix = __clc_as_int(x); int ax = ix & EXSIGNBIT_SP32; int xpos = ix == ax; - int iy = as_int(y); + int iy = __clc_as_int(y); int ay = iy & EXSIGNBIT_SP32; int ypos = iy == ay; // Extra precise log calculation // First handle case that x is close to 1 - float r = 1.0f - as_float(ax); + float r = 1.0f - __clc_as_float(ax); int near1 = __clc_fabs(r) < 0x1.0p-4f; float r2 = r * r; @@ -101,7 +102,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { // Computations for x not near 1 int m = (int)(ax >> EXPSHIFTBITS_SP32) - EXPBIAS_SP32; float mf = (float)m; - int ixs = as_int(as_float(ax | 0x3f800000) - 1.0f); + int ixs = __clc_as_int(__clc_as_float(ax | 0x3f800000) - 1.0f); float mfs = (float)((ixs >> EXPSHIFTBITS_SP32) - 253); int c = m == -127; int ixn = c ? ixs : ax; @@ -110,8 +111,8 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { int indx = (ixn & 0x007f0000) + ((ixn & 0x00008000) << 1); // F - Y - float f = as_float(0x3f000000 | indx) - - as_float(0x3f000000 | (ixn & MANTBITS_SP32)); + float f = __clc_as_float(0x3f000000 | indx) - + __clc_as_float(0x3f000000 | (ixn & MANTBITS_SP32)); indx = indx >> 16; float2 tv = USE_TABLE(log_inv_tbl_ep, indx); @@ -139,13 +140,13 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { lh = near1 ? lh_near1 : lh; l = near1 ? 
l_near1 : l; - float gh = as_float(as_int(l) & 0xfffff000); + float gh = __clc_as_float(__clc_as_int(l) & 0xfffff000); float gt = ((ltt - (lt - lth)) + ((lh - l) + lt)) + (l - gh); - float yh = as_float(iy & 0xfffff000); + float yh = __clc_as_float(iy & 0xfffff000); float fny = (float)ny; - float fnyh = as_float(as_int(fny) & 0xfffff000); + float fnyh = __clc_as_float(__clc_as_int(fny) & 0xfffff000); float fnyt = (float)(ny - (int)fnyh); float yt = MATH_DIVIDE(__clc_mad(-fnyt, yh, __clc_mad(-fnyh, yh, 1.0f)), fny); @@ -155,7 +156,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { // Extra precise exp of ylogx const float R_64_BY_LOG2 = 0x1.715476p+6f; // 64/log2 : 92.332482616893657 - int n = convert_int(ylogx * R_64_BY_LOG2); + int n = __clc_convert_int(ylogx * R_64_BY_LOG2); float nf = (float)n; int j = n & 0x3f; @@ -179,16 +180,16 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { float expylogx = __clc_mad(tv.s0, poly, __clc_mad(tv.s1, poly, tv.s1)) + tv.s0; float sexpylogx = __clc_fp32_subnormals_supported() - ? expylogx * as_float(0x1 << (m + 149)) + ? expylogx * __clc_as_float(0x1 << (m + 149)) : 0.0f; - float texpylogx = as_float(as_int(expylogx) + m2); + float texpylogx = __clc_as_float(__clc_as_int(expylogx) + m2); expylogx = m < -125 ? sexpylogx : texpylogx; // Result is +-Inf if (ylogx + ylogx_t) > 128*log2 expylogx = ((ylogx > 0x1.62e430p+6f) | (ylogx == 0x1.62e430p+6f & ylogx_t > -0x1.05c610p-22f)) - ? as_float(PINFBITPATT_SP32) + ? __clc_as_float(PINFBITPATT_SP32) : expylogx; // Result is 0 if ylogx < -149*log2 @@ -201,9 +202,9 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { int inty = 2 - (ny & 1); - float signval = as_float((as_uint(expylogx) ^ SIGNBIT_SP32)); + float signval = __clc_as_float((__clc_as_uint(expylogx) ^ SIGNBIT_SP32)); expylogx = ((inty == 1) & !xpos) ? signval : expylogx; - int ret = as_int(expylogx); + int ret = __clc_as_int(expylogx); // Corner case handling ret = (!xpos & (inty == 2)) ? QNANBITPATT_SP32 : ret; @@ -221,7 +222,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_rootn(float x, int ny) { ret = ax > PINFBITPATT_SP32 ? ix : ret; ret = ny == 0 ? 
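The corner-case ladder in clc_rootn implements the usual rootn contract: a real n-th root, defined for negative x only when n is odd, and NaN for n == 0 or an even root of a negative number. Reference sketch in C (rootn_ref is an invented name):

#include <math.h>

float rootn_ref(float x, int n) {
  if (n == 0)
    return NAN;                  /* matches the ny == 0 branch above */
  if (x < 0.0f && (n & 1) == 0)
    return NAN;                  /* even root of a negative number   */
  float mag = powf(fabsf(x), 1.0f / (float)n);
  return x < 0.0f ? -mag : mag;  /* odd n: sign follows x            */
}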
QNANBITPATT_SP32 : ret; - return as_float(ret); + return __clc_as_float(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, float, __clc_rootn, float, int) @@ -233,11 +234,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { double dny = (double)ny; double y = 1.0 / dny; - long ux = as_long(x); + long ux = __clc_as_long(x); long ax = ux & (~SIGNBIT_DP64); int xpos = ax == ux; - long uy = as_long(y); + long uy = __clc_as_long(y); long ay = uy & (~SIGNBIT_DP64); int ypos = ay == uy; @@ -249,7 +250,8 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { double xexp = (double)exp; long mantissa = ax & 0x000FFFFFFFFFFFFFL; - long temp_ux = as_long(as_double(0x3ff0000000000000L | mantissa) - 1.0); + long temp_ux = + __clc_as_long(__clc_as_double(0x3ff0000000000000L | mantissa) - 1.0); exp = ((temp_ux & 0x7FF0000000000000L) >> 52) - 2045; double xexp1 = (double)exp; long mantissa1 = temp_ux & 0x000FFFFFFFFFFFFFL; @@ -261,14 +263,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { ((mantissa & 0x0000080000000000) << 1); int index = rax >> 44; - double F = as_double(rax | 0x3FE0000000000000L); - double Y = as_double(mantissa | 0x3FE0000000000000L); + double F = __clc_as_double(rax | 0x3FE0000000000000L); + double Y = __clc_as_double(mantissa | 0x3FE0000000000000L); double f = F - Y; double2 tv = USE_TABLE(log_f_inv_tbl, index); double log_h = tv.s0; double log_t = tv.s1; double f_inv = (log_h + log_t) * f; - double r1 = as_double(as_long(f_inv) & 0xfffffffff8000000L); + double r1 = __clc_as_double(__clc_as_long(f_inv) & 0xfffffffff8000000L); double r2 = fma(-F, r1, f) * (log_h + log_t); double r = r1 + r2; @@ -292,14 +294,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { double resT_h = poly0h; double H = resT + resH; - double H_h = as_double(as_long(H) & 0xfffffffff8000000L); + double H_h = __clc_as_double(__clc_as_long(H) & 0xfffffffff8000000L); double T = (resH - H + resT) + (resT_t - (resT + resT_h)) + (H - H_h); H = H_h; - double y_head = as_double(uy & 0xfffffffff8000000L); + double y_head = __clc_as_double(uy & 0xfffffffff8000000L); double y_tail = y - y_head; - double fnyh = as_double(as_long(dny) & 0xfffffffffff00000); + double fnyh = __clc_as_double(__clc_as_long(dny) & 0xfffffffffff00000); double fnyt = (double)(ny - (int)fnyh); y_tail = fma(-fnyt, y_head, fma(-fnyh, y_head, 1.0)) / dny; @@ -346,7 +348,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { expv = fma(f, q, f2) + f1; expv = ldexp(expv, m); - expv = v > max_exp_arg ? as_double(0x7FF0000000000000L) : expv; + expv = v > max_exp_arg ? __clc_as_double(0x7FF0000000000000L) : expv; expv = v < min_exp_arg ? 0.0 : expv; } @@ -359,7 +361,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { expv *= ((inty == 1) & !xpos) ? -1.0 : 1.0; - long ret = as_long(expv); + long ret = __clc_as_long(expv); // Now all the edge cases ret = (!xpos & (inty == 2)) ? QNANBITPATT_DP64 : ret; @@ -377,7 +379,7 @@ _CLC_DEF _CLC_OVERLOAD double __clc_rootn(double x, int ny) { ret = ((ux == PINFBITPATT_DP64) & ypos) ? PINFBITPATT_DP64 : ret; ret = ax > PINFBITPATT_DP64 ? ux : ret; ret = ny == 0 ? 
QNANBITPATT_DP64 : ret; - return as_double(ret); + return __clc_as_double(ret); } _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int) #endif @@ -387,7 +389,7 @@ _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_rootn, double, int) #pragma OPENCL EXTENSION cl_khr_fp16 : enable _CLC_OVERLOAD _CLC_DEF half __clc_rootn(half x, int y) { - return (half)__clc_rootn((float)x, y); + return (half)__clc_rootn((float)x, y); } _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __clc_rootn, half, int); diff --git a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst index 0dbbd5f869e36..f9b23493b2356 100644 --- a/libcxx/docs/DesignDocs/ExperimentalFeatures.rst +++ b/libcxx/docs/DesignDocs/ExperimentalFeatures.rst @@ -161,7 +161,9 @@ has been removed in LLVM 17.0. `Networking TS `__ ------------------------------------------- The Networking TS is not yet part of a shipping standard, and there is discussion around removing it. -Libc++ never shipped an implementation of the Networking TS and does not plan to do so in the future. +Libc++ never shipped an implementation of the Networking TS and does not plan to do so in the future, +unless the C++ Standards Committee expresses a desire to merge the Networking TS into the IS (which is +unlikely at this point). `Ranges TS `__ --------------------------------------- diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index 45faea0568b2e..1ec23dfabd5ea 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -111,6 +111,16 @@ "`LWG4169 `__","``std::atomic``'s default constructor should be constrained","2024-11 (Wrocław)","","","" "`LWG4170 `__","``contiguous_iterator`` should require ``to_address(I{})``","2024-11 (Wrocław)","","","" "","","","","","" +"`LWG3578 `__","Iterator SCARYness in the context of associative container merging","2025-02 (Hagenberg)","","","" +"`LWG3956 `__","``chrono::parse`` uses ``from_stream`` as a customization point","2025-02 (Hagenberg)","","","" +"`LWG4172 `__","``unique_lock`` self-move-assignment is broken","2025-02 (Hagenberg)","","","" +"`LWG4175 `__","``get_env()`` specified in terms of ``as_const()`` but this doesn't work with rvalue senders","2025-02 (Hagenberg)","","","" +"`LWG4179 `__","Wrong range in ``[alg.search]``","2025-02 (Hagenberg)","","","" +"`LWG4186 `__","``regex_traits::transform_primary`` mistakenly detects ``typeid`` of a function","2025-02 (Hagenberg)","","","" +"`LWG4189 `__","``cache_latest_view`` should be freestanding","2025-02 (Hagenberg)","","","" +"`LWG4191 `__","P1467 changed the return type of ``pow(complex, int)``","2025-02 (Hagenberg)","","","" +"`LWG4196 `__","Complexity of ``inplace_merge()`` is incorrect","2025-02 (Hagenberg)","","","" +"","","","","","" "`LWG3343 `__","Ordering of calls to ``unlock()`` and ``notify_all()`` in Effects element of ``notify_all_at_thread_exit()`` should be reversed","Not Adopted Yet","|Complete|","16","" "`LWG4139 `__","§[time.zone.leap] recursive constraint in <=>","Not Adopted Yet","|Complete|","20","" "`LWG3456 `__","Pattern used by std::from_chars is underspecified (option B)","Not Adopted Yet","|Complete|","20","" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index b2bb1d6e9d6c3..1436db6cf2b45 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -79,7 +79,6 @@ "`P3136R1 `__","Retiring niebloids","2024-11 (Wrocław)","","",""
"`P3138R5 `__","``views::cache_latest``","2024-11 (WrocÅ‚aw)","","","" "`P3379R0 `__","Constrain ``std::expected`` equality operators","2024-11 (WrocÅ‚aw)","","","" -"`P0472R2 `__","Put ``std::monostate`` in ````","2024-11 (WrocÅ‚aw)","","","" "`P2862R1 `__","``text_encoding::name()`` should never return null values","2024-11 (WrocÅ‚aw)","","","" "`P2897R7 `__","``aligned_accessor``: An ``mdspan`` accessor expressing pointer over-alignment","2024-11 (WrocÅ‚aw)","","","" "`P3355R1 `__","Fix ``submdspan`` for C++26","2024-11 (WrocÅ‚aw)","","","" @@ -92,9 +91,29 @@ "`P3369R0 `__","constexpr for ``uninitialized_default_construct``","2024-11 (WrocÅ‚aw)","","","" "`P3370R1 `__","Add new library headers from C23","2024-11 (WrocÅ‚aw)","","","" "`P3309R3 `__","constexpr ``atomic`` and ``atomic_ref``","2024-11 (WrocÅ‚aw)","","","" -"`P3019R11 `__","``indirect`` and ``polymorphic``: Vocabulary Types for Composite Class Design","2024-11 (WrocÅ‚aw)","","","" "`P1928R15 `__","``std::simd`` — merge data-parallel types from the Parallelism TS 2","2024-11 (WrocÅ‚aw)","","","" "`P3325R5 `__","A Utility for Creating Execution Environments","2024-11 (WrocÅ‚aw)","","","" "`P3068R6 `__","Allowing exception throwing in constant-evaluation","2024-11 (WrocÅ‚aw)","","","" "`P3247R2 `__","Deprecate the notion of trivial types","2024-11 (WrocÅ‚aw)","","","" "","","","","","" +"`P3074R7 `__","trivial unions (was ``std::uninitialized``)","2025-02 (Hagenberg)","","","" +"`P1494R5 `__","Partial program correctness","2025-02 (Hagenberg)","","","" +"`P2900R14 `__","Contracts for C++","2025-02 (Hagenberg)","","","" +"`P3475R2 `__","Defang and deprecate ``memory_order::consume``","2025-02 (Hagenberg)","","","" +"`P2786R13 `__","Trivial Relocatability For C++26","2025-02 (Hagenberg)","","","" +"`P3137R3 `__","``views::to_input``","2025-02 (Hagenberg)","","","" +"`P0472R3 `__","Put ``std::monostate`` in ````","2025-02 (Hagenberg)","","","" +"`P3349R1 `__","Converting contiguous iterators to pointers","2025-02 (Hagenberg)","","","" +"`P3372R3 `__","constexpr containers and adaptors","2025-02 (Hagenberg)","","","" +"`P3378R2 `__","constexpr exception types","2025-02 (Hagenberg)","","","" +"`P3441R2 `__","Rename ``simd_split`` to ``simd_chunk``","2025-02 (Hagenberg)","","","" +"`P3287R3 `__","Exploration of namespaces for ``std::simd``","2025-02 (Hagenberg)","","","" +"`P2976R1 `__","Freestanding Library: ``algorithm``, ``numeric``, and ``random``","2025-02 (Hagenberg)","","","" +"`P3430R3 `__","simd issues: explicit, unsequenced, identity-element position, and members of disabled simd","2025-02 (Hagenberg)","","","" +"`P2663R7 `__","Interleaved complex values support in ``std::simd``","2025-02 (Hagenberg)","","","" +"`P2933R4 `__","Extend ```` header function with overloads for ``std::simd``","2025-02 (Hagenberg)","","","" +"`P2846R6 `__","``reserve_hint``: Eagerly reserving memory for not-quite-sized lazy ranges","2025-02 (Hagenberg)","","","" +"`P3471R4 `__","Standard Library Hardening","2025-02 (Hagenberg)","","","" +"`P0447R28 `__","Introduction of ``std::hive`` to the standard library","2025-02 (Hagenberg)","","","" +"`P3019R14 `__","``indirect`` and ``polymorphic``: Vocabulary Types for Composite Class Design","2025-02 (Hagenberg)","","","" +"","","","","","" diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 53c6b84c22ea7..2c4b865050306 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -128,14 +128,14 @@ Libc++ aims to support common compilers that implement the C++11 Standard. 
In order to strike a good balance between stability for users and maintenance cost, testing coverage and development velocity, libc++ drops support for older compilers as newer ones are released. -============ =============== ========================== ===================== -Compiler Versions Restrictions Support policy -============ =============== ========================== ===================== -Clang 17, 18, 19-git latest two stable releases per `LLVM's release page `_ and the development version -AppleClang 15 latest stable release per `Xcode's release page `_ -Open XL 17.1 (AIX) latest stable release per `Open XL's documentation page `_ -GCC 14 In C++11 or later only latest stable release per `GCC's release page `_ -============ =============== ========================== ===================== +============ =================== ========================== ===================== +Compiler Versions Restrictions Support policy +============ =================== ========================== ===================== +Clang 18, 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version +AppleClang 15 latest stable release per `Xcode's release page `_ +Open XL 17.1 (AIX) latest stable release per `Open XL's documentation page `_ +GCC 14 In C++11 or later only latest stable release per `GCC's release page `_ +============ =================== ========================== ===================== Libc++ also supports common platforms and architectures: diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp index f38f495c2d0bb..1f18226636fd5 100644 --- a/libcxx/src/experimental/tzdb.cpp +++ b/libcxx/src/experimental/tzdb.cpp @@ -708,6 +708,39 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) { std::__throw_runtime_error("unknown time zone"); } #else // ifdef _WIN32 + +[[nodiscard]] static string __current_zone_environment() { + if (const char* __tz = std::getenv("TZ")) + return __tz; + + return {}; +} + +[[nodiscard]] static string __current_zone_etc_localtime() { + filesystem::path __path = "/etc/localtime"; + if (!filesystem::exists(__path) || !filesystem::is_symlink(__path)) + return {}; + + filesystem::path __tz = filesystem::read_symlink(__path); + // The path may be a relative path; in that case, convert it to an absolute + // path based on the proper initial directory. + if (__tz.is_relative()) + __tz = filesystem::canonical("/etc" / __tz); + + return filesystem::relative(__tz, "/usr/share/zoneinfo/"); +} + +[[nodiscard]] static string __current_zone_etc_timezone() { + filesystem::path __path = "/etc/timezone"; + if (!filesystem::exists(__path)) + return {}; + + ifstream __f(__path); + string __name; + std::getline(__f, __name); + return __name; +} + [[nodiscard]] static const time_zone* __current_zone_posix(const tzdb& tzdb) { // On POSIX systems there are several ways to configure the time zone. // In order of priority they are: @@ -726,30 +759,28 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) { // // - The time zone name is the target of the symlink /etc/localtime // relative to /usr/share/zoneinfo/ + // + // - The file /etc/timezone. This text file contains the name of the time + // zone. + // + // On Linux systems it seems /etc/timezone is deprecated and being phased + // out. This file is used when /etc/localtime does not exist, or when it exists but is not a symlink.
For more information and links see + // https://github.com/llvm/llvm-project/issues/105634 - // The algorithm is like this: - // - If the environment variable TZ is set and points to a valid - // record use this value. - // - Else use the name based on the `/etc/localtime` symlink. + string __name = chrono::__current_zone_environment(); - if (const char* __tz = getenv("TZ")) - if (const time_zone* __result = tzdb.__locate_zone(__tz)) + // Ignore invalid names in the environment. + if (!__name.empty()) + if (const time_zone* __result = tzdb.__locate_zone(__name)) return __result; - filesystem::path __path = "/etc/localtime"; - if (!filesystem::exists(__path)) - std::__throw_runtime_error("tzdb: the symlink '/etc/localtime' does not exist"); - - if (!filesystem::is_symlink(__path)) - std::__throw_runtime_error("tzdb: the path '/etc/localtime' is not a symlink"); + __name = chrono::__current_zone_etc_localtime(); + if (__name.empty()) + __name = chrono::__current_zone_etc_timezone(); - filesystem::path __tz = filesystem::read_symlink(__path); - // The path may be a relative path, in that case convert it to an absolute - // path based on the proper initial directory. - if (__tz.is_relative()) - __tz = filesystem::canonical("/etc" / __tz); + if (__name.empty()) + std::__throw_runtime_error("tzdb: unable to determine the name of the current time zone"); - string __name = filesystem::relative(__tz, "/usr/share/zoneinfo/"); if (const time_zone* __result = tzdb.__locate_zone(__name)) return __result; diff --git a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp index 6509bb58140ab..dcdce261298c1 100644 --- a/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp +++ b/libcxx/test/libcxx/gdb/gdb_pretty_printer_test.sh.cpp @@ -12,7 +12,7 @@ // UNSUPPORTED: c++03 // TODO: Investigate these failures which break the CI. -// UNSUPPORTED: clang-18, clang-19, clang-20 +// UNSUPPORTED: clang-18, clang-19, clang-20, clang-21 // The Android libc++ tests are run on a non-Android host, connected to an // Android device over adb. gdb needs special support to make this work (e.g. diff --git a/lld/test/ELF/lto/arm-rtlibcall.ll b/lld/test/ELF/lto/arm-rtlibcall.ll new file mode 100644 index 0000000000000..b254fa0c034e8 --- /dev/null +++ b/lld/test/ELF/lto/arm-rtlibcall.ll @@ -0,0 +1,126 @@ +; REQUIRES: arm +;; Test for LTO optimizing out references to symbols that are pulled in by +;; compiler-generated libcalls (post LTO). +;; Lazy files extracted post LTO compilation might reference other lazy files. +;; Referenced relocatable files are extracted and everything works as intended. +;; However, if the referenced lazy file is a bitcode file, no further LTO +;; compilation occurs. lld currently treats any symbols from that bitcode file +;; as absolute, which leads to a "refer to absolute symbol" error in PIC links +;; and leads to silently broken output. For example, lazy aeabi_ldivmod.o post +;; LTO extraction might call __divmoddi4 defined in an unextracted lazy bitcode +;; file (https://github.com/llvm/llvm-project/issues/127284). 
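+;; For illustration, main-implicit.ll below is roughly what clang emits for C source like: +;; long long f(long long num, long long denom) { return num / denom + 2; } +;; On ARM EABI targets ISel lowers the 64-bit sdiv to a call to __aeabi_ldivmod, +;; so the libcall reference only appears after LTO codegen.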
+; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: llvm-as divmoddi4.ll -o divmoddi4.bc +; RUN: llvm-mc -filetype=obj -triple=armv7-none-unknown-eabi aeabi_ldivmod.s -o aeabi_ldivmod.o +;; With an explicit __aeabi_ldivmod call in the input IR, this works as expected: +; RUN: llvm-as main-explicit.ll -o main-explicit-ldivmod.bc +; RUN: ld.lld main-explicit-ldivmod.bc --start-lib aeabi_ldivmod.o divmoddi4.bc --end-lib -o out.explicit +; RUN: llvm-objdump -d -r -t out.explicit | FileCheck %s --check-prefix=GOOD-DUMP +; GOOD-DUMP-LABEL: SYMBOL TABLE: +; GOOD-DUMP: [[#]] g F .text 00000024 __aeabi_ldivmod +; GOOD-DUMP: [[#]] g F .text [[#]] __divmoddi4 +; GOOD-DUMP-LABEL: <__aeabi_ldivmod>: +; GOOD-DUMP: bl 0x20140 <__divmoddi4> @ imm = #0x28 + +;; But if the call is generated by ISel, we end up with an invalid reference: +; RUN: llvm-as main-implicit.ll -o main-implicit-ldivmod.bc +; RUN: ld.lld main-implicit-ldivmod.bc --start-lib aeabi_ldivmod.o divmoddi4.bc --end-lib -o out.implicit +; RUN: llvm-objdump -d -r -t out.implicit | FileCheck %s --check-prefix=BAD-DUMP +;; We jump to address zero here and __divmoddi4 ends up being an absolute symbol: +; BAD-DUMP-LABEL: SYMBOL TABLE: +; BAD-DUMP: [[#]] g F .text 00000024 __aeabi_ldivmod +; BAD-DUMP: [[#]] g *ABS* 00000000 __divmoddi4 +; BAD-DUMP-LABEL: <__aeabi_ldivmod>: +; BAD-DUMP: bl 0x0 <__divmoddi4> +;; Linking with -pie complains about the invalid relocation (and even points back to the source files) +; RUN: not ld.lld main-implicit-ldivmod.bc --start-lib aeabi_ldivmod.o divmoddi4.bc --end-lib -o out.pie --no-undefined -pie --no-dynamic-linker 2>&1 | FileCheck %s --check-prefix=PIE-ERROR +; PIE-ERROR: ld.lld: error: relocation R_ARM_CALL cannot refer to absolute symbol: __divmoddi4 +; PIE-ERROR-NEXT: >>> defined in divmoddi4.bc +; PIE-ERROR-NEXT: >>> referenced by aeabi_ldivmod.o:(__aeabi_ldivmod) +;; Removing --start-lib/--end-lib also ensures that the reference is retained +; RUN: ld.lld main-implicit-ldivmod.bc aeabi_ldivmod.o divmoddi4.bc -o out.nolib +; RUN: llvm-objdump -d -r -t out.nolib | FileCheck %s --check-prefix=GOOD-DUMP + +;; Interestingly, just declaring __aeabi_ldivmod is sufficient to not run into this issue. +; RUN: llvm-as main-declared.ll -o main-declared-ldivmod.bc +; RUN: ld.lld main-declared-ldivmod.bc --start-lib aeabi_ldivmod.o divmoddi4.bc --end-lib -o out.declared +; RUN: llvm-objdump -d -r -t out.declared | FileCheck %s --check-prefix=GOOD-DUMP + +;--- divmoddi4.ll +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-none-unknown-eabi" + +; Adding it to llvm.used does not appear to have any effect! +;; @llvm.used = appending global [1 x ptr] [ptr @__divmoddi4], section "llvm.metadata" + +; Stub version of the real __divmoddi4 +define i64 @__divmoddi4(i64 %a, i64 %b, ptr writeonly %rem) #0 align 32 { +entry: + %sub = sub i64 %a, %b + store i64 0, ptr %rem, align 8 + ret i64 %sub +} + +attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind willreturn memory(argmem: write) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" } + +;--- aeabi_ldivmod.s +.syntax unified +.p2align 2 +.arm +.globl __aeabi_ldivmod +.type __aeabi_ldivmod,%function +__aeabi_ldivmod: + push {r6, lr} + sub sp, sp, #16 + add r6, sp, #8 + str r6, [sp] + bl __divmoddi4 + ldr r2, [sp, #8] + ldr r3, [sp, #12] + add sp, sp, #16 + pop {r6, pc} +.size __aeabi_ldivmod, .
- __aeabi_ldivmod + +;--- main-implicit.ll +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-none-unknown-eabi" + +define dso_local i64 @_start(i64 %num, i64 %denom) local_unnamed_addr #0 { +entry: + %div = sdiv i64 %num, %denom + %ret = add i64 %div, 2 + ret i64 %ret +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" } + +;--- main-explicit.ll +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-none-unknown-eabi" + +declare { i64, i64 } @__aeabi_ldivmod(i64, i64) + +define dso_local noundef i64 @_start(i64 noundef %num, i64 noundef %denom) local_unnamed_addr #0 { +entry: + %quotrem = call { i64, i64 } @__aeabi_ldivmod(i64 %num, i64 %denom) + %div = extractvalue { i64, i64 } %quotrem, 0 + %ret = add i64 %div, 2 + ret i64 %ret +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" } + +;--- main-declared.ll +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv7-none-unknown-eabi" + +declare { i64, i64 } @__aeabi_ldivmod(i64, i64) + +define dso_local i64 @_start(i64 %num, i64 %denom) local_unnamed_addr #0 { +entry: + %div = sdiv i64 %num, %denom + %ret = add i64 %div, 2 + ret i64 %ret +} + +attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a15" } diff --git a/lld/test/MachO/export-options.s b/lld/test/MachO/export-options.s index 1a5d409e83383..df2f90e3a4f3f 100644 --- a/lld/test/MachO/export-options.s +++ b/lld/test/MachO/export-options.s @@ -117,13 +117,15 @@ ## Check that only string-literal patterns match ## Check that comments and blank lines are stripped from symbol list # RUN: %lld -dylib %t/symdefs.o -o %t/literal \ -# RUN: -exported_symbols_list %t/literals.txt +# RUN: -exported_symbols_list %t/literals.txt \ +# RUN: -exported_symbol singleton # RUN: llvm-objdump --macho --exports-trie %t/literal | \ # RUN: FileCheck --check-prefix=LITERAL %s # LITERAL-DAG: literal_only # LITERAL-DAG: literal_also # LITERAL-DAG: globby_also +# LITERAL-DAG: singleton # LITERAL-NOT: globby_only ## Check that only glob patterns match @@ -245,7 +247,7 @@ _keep_lazy: #--- symdefs.s -.globl literal_only, literal_also, globby_only, globby_also +.globl literal_only, literal_also, globby_only, globby_also, singleton literal_only: retq literal_also: @@ -254,6 +256,8 @@ globby_only: retq globby_also: retq +singleton: + retq #--- literals.txt diff --git a/lldb/include/lldb/Breakpoint/BreakpointList.h b/lldb/include/lldb/Breakpoint/BreakpointList.h index a7399d385f6f0..4a921fadfc066 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointList.h +++ b/lldb/include/lldb/Breakpoint/BreakpointList.h @@ -163,8 +163,7 @@ class BreakpointList { bool m_is_internal; public: - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable BreakpointIterable; BreakpointIterable Breakpoints() { return BreakpointIterable(m_breakpoints, GetMutex()); diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h index 34bd309864871..3aef1d658c0e5 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h +++ 
b/lldb/include/lldb/Breakpoint/BreakpointLocationCollection.h @@ -165,8 +165,7 @@ class BreakpointLocationCollection { mutable std::mutex m_collection_mutex; public: - typedef AdaptedIterable + typedef llvm::iterator_range BreakpointLocationCollectionIterable; BreakpointLocationCollectionIterable BreakpointLocations() { return BreakpointLocationCollectionIterable(m_break_loc_collection); diff --git a/lldb/include/lldb/Breakpoint/BreakpointLocationList.h b/lldb/include/lldb/Breakpoint/BreakpointLocationList.h index f76a8fcfdd7e7..17dc0bfe03148 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointLocationList.h +++ b/lldb/include/lldb/Breakpoint/BreakpointLocationList.h @@ -204,8 +204,7 @@ class BreakpointLocationList { BreakpointLocationCollection *m_new_location_recorder; public: - typedef AdaptedIterable + typedef llvm::iterator_range BreakpointLocationIterable; BreakpointLocationIterable BreakpointLocations() { diff --git a/lldb/include/lldb/Breakpoint/WatchpointList.h b/lldb/include/lldb/Breakpoint/WatchpointList.h index bf87495d79dba..d037d36e64290 100644 --- a/lldb/include/lldb/Breakpoint/WatchpointList.h +++ b/lldb/include/lldb/Breakpoint/WatchpointList.h @@ -39,8 +39,7 @@ class WatchpointList { ~WatchpointList(); typedef std::list wp_collection; - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable WatchpointIterable; /// Add a Watchpoint to the list. diff --git a/lldb/include/lldb/Breakpoint/WatchpointResource.h b/lldb/include/lldb/Breakpoint/WatchpointResource.h index 070d84cff8f26..c1a81fc486eb6 100644 --- a/lldb/include/lldb/Breakpoint/WatchpointResource.h +++ b/lldb/include/lldb/Breakpoint/WatchpointResource.h @@ -39,8 +39,7 @@ class WatchpointResource void SetType(bool read, bool write); typedef std::vector WatchpointCollection; - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable WatchpointIterable; /// Iterate over the watchpoint constituents for this resource diff --git a/lldb/include/lldb/Core/ModuleList.h b/lldb/include/lldb/Core/ModuleList.h index 43d931a844740..29b87de88520d 100644 --- a/lldb/include/lldb/Core/ModuleList.h +++ b/lldb/include/lldb/Core/ModuleList.h @@ -521,14 +521,13 @@ class ModuleList { Notifier *m_notifier = nullptr; public: - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable ModuleIterable; ModuleIterable Modules() const { return ModuleIterable(m_modules, GetMutex()); } - typedef AdaptedIterable + typedef llvm::iterator_range ModuleIterableNoLocking; ModuleIterableNoLocking ModulesNoLocking() const { return ModuleIterableNoLocking(m_modules); diff --git a/lldb/include/lldb/Core/ModuleSpec.h b/lldb/include/lldb/Core/ModuleSpec.h index 4cbbbfa8a26e1..86be0383f8b47 100644 --- a/lldb/include/lldb/Core/ModuleSpec.h +++ b/lldb/include/lldb/Core/ModuleSpec.h @@ -389,8 +389,7 @@ class ModuleSpecList { } typedef std::vector collection; - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable ModuleSpecIterable; ModuleSpecIterable ModuleSpecs() { diff --git a/lldb/include/lldb/Core/Telemetry.h b/lldb/include/lldb/Core/Telemetry.h index 60a7097de5eee..b72556ecaf3c9 100644 --- a/lldb/include/lldb/Core/Telemetry.h +++ b/lldb/include/lldb/Core/Telemetry.h @@ -56,17 +56,24 @@ struct LLDBBaseTelemetryInfo : public llvm::telemetry::TelemetryInfo { void serialize(llvm::telemetry::Serializer &serializer) const override; }; -/// The base Telemetry manager instance in LLDB +/// The base Telemetry manager instance in LLDB. /// This class declares additional instrumentation points /// applicable to LLDB. 
class TelemetryManager : public llvm::telemetry::Manager { public: + llvm::Error preDispatch(llvm::telemetry::TelemetryInfo *entry) override; + + virtual llvm::StringRef GetInstanceName() const = 0; + static TelemetryManager *getInstance(); + +protected: TelemetryManager(std::unique_ptr config); - llvm::Error preDispatch(llvm::telemetry::TelemetryInfo *entry) override; + static void setInstance(std::unique_ptr manager); private: std::unique_ptr m_config; + static std::unique_ptr g_instance; }; } // namespace telemetry diff --git a/lldb/include/lldb/Host/common/NativeProcessProtocol.h b/lldb/include/lldb/Host/common/NativeProcessProtocol.h index 744699210d4b5..1d5fecfcd5c27 100644 --- a/lldb/include/lldb/Host/common/NativeProcessProtocol.h +++ b/lldb/include/lldb/Host/common/NativeProcessProtocol.h @@ -51,13 +51,9 @@ class NativeProcessProtocol { virtual ~NativeProcessProtocol() = default; typedef std::vector> thread_collection; - template - static NativeThreadProtocol &thread_list_adapter(I &iter) { - assert(*iter); - return **iter; - } - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable< + std::recursive_mutex, thread_collection, + llvm::pointee_iterator> ThreadIterable; virtual Status Resume(const ResumeActionList &resume_actions) = 0; diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h index 69fbe544c73cd..8b6317c6f33c2 100644 --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -467,7 +467,7 @@ class SymbolContextList { const_iterator begin() const { return m_symbol_contexts.begin(); } const_iterator end() const { return m_symbol_contexts.end(); } - typedef AdaptedIterable + typedef llvm::iterator_range SymbolContextIterable; SymbolContextIterable SymbolContexts() { return SymbolContextIterable(m_symbol_contexts); diff --git a/lldb/include/lldb/Symbol/TypeList.h b/lldb/include/lldb/Symbol/TypeList.h index d58772ad5b62e..6a38babd942ab 100644 --- a/lldb/include/lldb/Symbol/TypeList.h +++ b/lldb/include/lldb/Symbol/TypeList.h @@ -39,8 +39,7 @@ class TypeList { lldb::TypeSP GetTypeAtIndex(uint32_t idx); typedef std::vector collection; - typedef AdaptedIterable - TypeIterable; + typedef llvm::iterator_range TypeIterable; TypeIterable Types() { return TypeIterable(m_types); } diff --git a/lldb/include/lldb/Symbol/TypeMap.h b/lldb/include/lldb/Symbol/TypeMap.h index 89011efab5c31..6c36ff9369fa5 100644 --- a/lldb/include/lldb/Symbol/TypeMap.h +++ b/lldb/include/lldb/Symbol/TypeMap.h @@ -44,7 +44,8 @@ class TypeMap { lldb::TypeSP FirstType() const; typedef std::multimap collection; - typedef AdaptedIterable TypeIterable; + typedef llvm::iterator_range> + TypeIterable; TypeIterable Types() const { return TypeIterable(m_types); } diff --git a/lldb/include/lldb/Target/QueueList.h b/lldb/include/lldb/Target/QueueList.h index 7c74a6a99ac18..3f177c90d3989 100644 --- a/lldb/include/lldb/Target/QueueList.h +++ b/lldb/include/lldb/Target/QueueList.h @@ -48,9 +48,7 @@ class QueueList { lldb::QueueSP GetQueueAtIndex(uint32_t idx); typedef std::vector collection; - typedef LockingAdaptedIterable - QueueIterable; + typedef LockingAdaptedIterable QueueIterable; /// Iterate over the list of queues /// diff --git a/lldb/include/lldb/Target/TargetList.h b/lldb/include/lldb/Target/TargetList.h index a0cddc6b2966f..080a6039c7ff8 100644 --- a/lldb/include/lldb/Target/TargetList.h +++ b/lldb/include/lldb/Target/TargetList.h @@ -44,8 +44,7 @@ class TargetList : public Broadcaster { } typedef std::vector collection;
- typedef LockingAdaptedIterable + typedef LockingAdaptedIterable TargetIterable; /// Create a new Target. diff --git a/lldb/include/lldb/Target/ThreadCollection.h b/lldb/include/lldb/Target/ThreadCollection.h index 29f5103e7eec7..3fe62787649f4 100644 --- a/lldb/include/lldb/Target/ThreadCollection.h +++ b/lldb/include/lldb/Target/ThreadCollection.h @@ -20,8 +20,7 @@ namespace lldb_private { class ThreadCollection { public: typedef std::vector collection; - typedef LockingAdaptedIterable + typedef LockingAdaptedIterable ThreadIterable; ThreadCollection(); diff --git a/lldb/include/lldb/Utility/Iterable.h b/lldb/include/lldb/Utility/Iterable.h index 5c38e46feb925..db1f0e65ef6f1 100644 --- a/lldb/include/lldb/Utility/Iterable.h +++ b/lldb/include/lldb/Utility/Iterable.h @@ -11,172 +11,37 @@ #include +#include namespace lldb_private { -template E map_adapter(I &iter) { - return iter->second; -} - -template E vector_adapter(I &iter) { return *iter; } - -template E list_adapter(I &iter) { return *iter; } - -template -class AdaptedConstIterator { -public: - typedef typename C::const_iterator BackingIterator; - - // Wrapping constructor - AdaptedConstIterator(BackingIterator backing_iterator) - : m_iter(backing_iterator) {} - - // Default-constructible - AdaptedConstIterator() : m_iter() {} - - // Copy-constructible - AdaptedConstIterator(const AdaptedConstIterator &rhs) : m_iter(rhs.m_iter) {} - - // Copy-assignable - AdaptedConstIterator &operator=(const AdaptedConstIterator &rhs) { - m_iter = rhs.m_iter; - return *this; - } - - // Destructible - ~AdaptedConstIterator() = default; - - // Comparable - bool operator==(const AdaptedConstIterator &rhs) { - return m_iter == rhs.m_iter; - } - - bool operator!=(const AdaptedConstIterator &rhs) { - return m_iter != rhs.m_iter; - } - - // Rvalue dereferenceable - E operator*() { return (*A)(m_iter); } - - E operator->() { return (*A)(m_iter); } - - // Offset dereferenceable - E operator[](typename BackingIterator::difference_type offset) { - return AdaptedConstIterator(m_iter + offset); - } - - // Incrementable - AdaptedConstIterator &operator++() { - m_iter++; - return *this; - } - - // Decrementable - AdaptedConstIterator &operator--() { - m_iter--; - return *this; - } - - // Compound assignment - AdaptedConstIterator & - operator+=(typename BackingIterator::difference_type offset) { - m_iter += offset; - return *this; - } - - AdaptedConstIterator & - operator-=(typename BackingIterator::difference_type offset) { - m_iter -= offset; - return *this; - } - - // Arithmetic - AdaptedConstIterator - operator+(typename BackingIterator::difference_type offset) { - return AdaptedConstIterator(m_iter + offset); - } - - AdaptedConstIterator - operator-(typename BackingIterator::difference_type offset) { - return AdaptedConstIterator(m_iter - offset); - } - - // Comparable - bool operator<(AdaptedConstIterator &rhs) { return m_iter < rhs.m_iter; } - - bool operator<=(AdaptedConstIterator &rhs) { return m_iter <= rhs.m_iter; } - - bool operator>(AdaptedConstIterator &rhs) { return m_iter > rhs.m_iter; } - - bool operator>=(AdaptedConstIterator &rhs) { return m_iter >= rhs.m_iter; } - - template - friend AdaptedConstIterator - operator+(typename C1::const_iterator::difference_type, - AdaptedConstIterator &); - - template - friend typename C1::const_iterator::difference_type - operator-(AdaptedConstIterator &, - AdaptedConstIterator &); - - template - friend void swap(AdaptedConstIterator &, - AdaptedConstIterator &); - -private: - BackingIterator m_iter; -}; - 
-template -AdaptedConstIterator operator+( - typename AdaptedConstIterator::BackingIterator::difference_type - offset, - AdaptedConstIterator &rhs) { - return rhs.operator+(offset); -} - -template -typename AdaptedConstIterator::BackingIterator::difference_type -operator-(AdaptedConstIterator &lhs, - AdaptedConstIterator &rhs) { - return (lhs.m_iter - rhs.m_iter); -} - -template -void swap(AdaptedConstIterator &lhs, - AdaptedConstIterator &rhs) { - std::swap(lhs.m_iter, rhs.m_iter); -} - -template -class AdaptedIterable { -private: - const C &m_container; - -public: - AdaptedIterable(const C &container) : m_container(container) {} - - AdaptedConstIterator begin() { - return AdaptedConstIterator(m_container.begin()); - } - - AdaptedConstIterator end() { - return AdaptedConstIterator(m_container.end()); - } +template ::value_type::second_type> +struct ValueMapIterator + : llvm::iterator_adaptor_base< + ValueMapIterator, WrappedIteratorT, + typename std::iterator_traits::iterator_category, + T> { + ValueMapIterator() = default; + explicit ValueMapIterator(WrappedIteratorT u) + : ValueMapIterator::iterator_adaptor_base(std::move(u)) {} + + const T &operator*() { return (*this->I).second; } + const T &operator*() const { return (*this->I).second; } }; -template -class LockingAdaptedIterable : public AdaptedIterable { +template +class LockingAdaptedIterable : public llvm::iterator_range { public: LockingAdaptedIterable(const C &container, MutexType &mutex) - : AdaptedIterable(container), m_mutex(&mutex) { + : llvm::iterator_range(container), m_mutex(&mutex) { m_mutex->lock(); } LockingAdaptedIterable(LockingAdaptedIterable &&rhs) - : AdaptedIterable(rhs), m_mutex(rhs.m_mutex) { + : llvm::iterator_range(rhs), m_mutex(rhs.m_mutex) { rhs.m_mutex = nullptr; } diff --git a/lldb/source/Core/Telemetry.cpp b/lldb/source/Core/Telemetry.cpp index 0d0d7c1df3bb9..5222f76704f91 100644 --- a/lldb/source/Core/Telemetry.cpp +++ b/lldb/source/Core/Telemetry.cpp @@ -29,10 +29,7 @@ namespace lldb_private { namespace telemetry { -using ::llvm::Error; -using ::llvm::telemetry::Destination; -using ::llvm::telemetry::Serializer; -using ::llvm::telemetry::TelemetryInfo; +using namespace llvm::telemetry; static uint64_t ToNanosec(const SteadyTimePoint Point) { return std::chrono::nanoseconds(Point.time_since_epoch()).count(); @@ -46,28 +43,34 @@ void LLDBBaseTelemetryInfo::serialize(Serializer &serializer) const { serializer.write("end_time", ToNanosec(end_time.value())); } -[[maybe_unused]] static std::string MakeUUID(lldb_private::Debugger *debugger) { +[[maybe_unused]] static std::string MakeUUID(Debugger *debugger) { uint8_t random_bytes[16]; if (auto ec = llvm::getRandomBytes(random_bytes, 16)) { LLDB_LOG(GetLog(LLDBLog::Object), "Failed to generate random bytes for UUID: {0}", ec.message()); - // fallback to using timestamp + debugger ID. + // Fallback to using timestamp + debugger ID. return llvm::formatv( "{0}_{1}", std::chrono::steady_clock::now().time_since_epoch().count(), debugger->GetID()); } - return lldb_private::UUID(random_bytes).GetAsString(); + return UUID(random_bytes).GetAsString(); } -TelemetryManager::TelemetryManager( - std::unique_ptr config) +TelemetryManager::TelemetryManager(std::unique_ptr config) : m_config(std::move(config)) {} llvm::Error TelemetryManager::preDispatch(TelemetryInfo *entry) { // Do nothing for now. // In an upcoming patch, this will be where the manager // attaches the session_uuid to the entry.
- return Error::success(); + return llvm::Error::success(); +} + +std::unique_ptr TelemetryManager::g_instance = nullptr; +TelemetryManager *TelemetryManager::getInstance() { return g_instance.get(); } + +void TelemetryManager::setInstance(std::unique_ptr manager) { + g_instance = std::move(manager); } } // namespace telemetry diff --git a/lldb/unittests/Core/CMakeLists.txt b/lldb/unittests/Core/CMakeLists.txt index 949963fd40346..d4d3764b67ae3 100644 --- a/lldb/unittests/Core/CMakeLists.txt +++ b/lldb/unittests/Core/CMakeLists.txt @@ -1,3 +1,7 @@ +if (LLVM_BUILD_TELEMETRY) + set(TELEMETRY_DEPS Telemetry) +endif() + add_lldb_unittest(LLDBCoreTests CommunicationTest.cpp DiagnosticEventTest.cpp @@ -10,6 +14,7 @@ add_lldb_unittest(LLDBCoreTests RichManglingContextTest.cpp SourceLocationSpecTest.cpp SourceManagerTest.cpp + TelemetryTest.cpp UniqueCStringMapTest.cpp LINK_LIBS @@ -26,4 +31,5 @@ add_lldb_unittest(LLDBCoreTests LLVMTestingSupport LINK_COMPONENTS Support + ${TELEMETRY_DEPS} ) diff --git a/lldb/unittests/Core/TelemetryTest.cpp b/lldb/unittests/Core/TelemetryTest.cpp new file mode 100644 index 0000000000000..0f2eaccb21a2c --- /dev/null +++ b/lldb/unittests/Core/TelemetryTest.cpp @@ -0,0 +1,98 @@ +//===-- TelemetryTest.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Config/llvm-config.h" + +#ifdef LLVM_BUILD_TELEMETRY + +#include "lldb/Core/PluginInterface.h" +#include "lldb/Core/PluginManager.h" +#include "lldb/Core/Telemetry.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Telemetry/Telemetry.h" +#include "llvm/Testing/Support/Error.h" +#include "gtest/gtest.h" +#include +#include + +namespace lldb_private { + +struct FakeTelemetryInfo : public llvm::telemetry::TelemetryInfo { + std::string msg; +}; + +class TestDestination : public llvm::telemetry::Destination { +public: + TestDestination(std::vector *entries) + : received_entries(entries) {} + + llvm::Error + receiveEntry(const llvm::telemetry::TelemetryInfo *entry) override { + received_entries->push_back(entry); + return llvm::Error::success(); + } + + llvm::StringLiteral name() const override { return "TestDestination"; } + +private: + std::vector *received_entries; +}; + +class FakePlugin : public telemetry::TelemetryManager { +public: + FakePlugin() + : telemetry::TelemetryManager( + std::make_unique(true)) {} + + // TelemetryManager interface + llvm::Error preDispatch(llvm::telemetry::TelemetryInfo *entry) override { + if (auto *fake_entry = llvm::dyn_cast(entry)) + fake_entry->msg = "In FakePlugin"; + + return llvm::Error::success(); + } + + llvm::StringRef GetInstanceName() const override { + return "FakeTelemetryPlugin"; + } + + static void Initialize() { + telemetry::TelemetryManager::setInstance(std::make_unique()); + } + + static void Terminate() { telemetry::TelemetryManager::setInstance(nullptr); } +}; + +} // namespace lldb_private + +TEST(TelemetryTest, PluginTest) { + // This would have been called by the plugin reg in a "real" plugin + // For tests, we just call it directly. 
+ lldb_private::FakePlugin::Initialize(); + + auto *ins = lldb_private::telemetry::TelemetryManager::getInstance(); + ASSERT_NE(ins, nullptr); + + std::vector expected_entries; + ins->addDestination( + std::make_unique(&expected_entries)); + + lldb_private::FakeTelemetryInfo entry; + entry.msg = ""; + + ASSERT_THAT_ERROR(ins->dispatch(&entry), ::llvm::Succeeded()); + ASSERT_EQ(1U, expected_entries.size()); + EXPECT_EQ("In FakePlugin", + llvm::dyn_cast(expected_entries[0]) + ->msg); + + ASSERT_EQ("FakeTelemetryPlugin", ins->GetInstanceName()); +} + +#endif // LLVM_BUILD_TELEMETRY diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 899b2cf3b4901..5966d1617feee 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -990,7 +990,12 @@ supported for the ``amdgcn`` target. the stride must be 0, the "add tid" flag must be 0, the swizzle enable bits must be off, and the extent must be measured in bytes. (On subtargets where bounds checking may be disabled, buffer fat pointers may choose to enable - it or not). + it or not). The cache swizzle support introduced in gfx942 may be used. + + These pointers can be created by `addrspacecast` from a buffer resource + (`ptr addrspace(8)`) or by using `llvm.amdgcn.make.buffer.rsrc` to produce a + `ptr addrspace(7)` directly, which produces a buffer fat pointer with an initial + offset of 0 and prevents the address space cast from being rewritten away. **Buffer Resource** The buffer resource pointer, in address space 8, is the newer form @@ -1027,6 +1032,12 @@ supported for the ``amdgcn`` target. the stride is the size of a structured element, the "add tid" flag must be 0, and the swizzle enable bits must be off. + These pointers can be created by `addrspacecast` from a buffer resource + (`ptr addrspace(8)`) or by using `llvm.amdgcn.make.buffer.rsrc` to produce a + `ptr addrspace(9)` directly, which produces a buffer strided pointer whose initial + index and offset values are both 0. This prevents the address space cast from + being rewritten away. + **Streamout Registers** Dedicated registers used by the GS NGG Streamout Instructions. The register file is modelled as a memory in a distinct address space because it is indexed diff --git a/llvm/docs/OptBisect.rst b/llvm/docs/OptBisect.rst index 809f54883e5a9..0e4d31acbe71e 100644 --- a/llvm/docs/OptBisect.rst +++ b/llvm/docs/OptBisect.rst @@ -157,7 +157,6 @@ to make this check uniform across all passes. These helper functions are: .. code-block:: c++ bool ModulePass::skipModule(Module &M); - bool CallGraphSCCPass::skipSCC(CallGraphSCC &SCC); bool FunctionPass::skipFunction(const Function &F); bool LoopPass::skipLoop(const Loop *L); diff --git a/llvm/include/llvm/Analysis/CallGraph.h b/llvm/include/llvm/Analysis/CallGraph.h index f5ce2322b76d9..7f977db161c20 100644 --- a/llvm/include/llvm/Analysis/CallGraph.h +++ b/llvm/include/llvm/Analysis/CallGraph.h @@ -129,10 +129,6 @@ class CallGraph { return CallsExternalNode.get(); } - /// Old node has been deleted, and New is to be used in its place, update the - /// ExternalCallingNode. - void ReplaceExternalCallEdge(CallGraphNode *Old, CallGraphNode *New); - //===--------------------------------------------------------------------- // Functions to keep a call graph up to date with a function that has been // modified. @@ -251,18 +247,6 @@ class CallGraphNode { CalledFunctions.pop_back(); } - /// Removes the edge in the node for the specified call site. 
- /// - /// Note that this method takes linear time, so it should be used sparingly. - void removeCallEdgeFor(CallBase &Call); - - /// Removes all call edges from this node to the specified callee - /// function. - /// - /// This takes more time to execute than removeCallEdgeTo, so it should not - /// be used unless necessary. - void removeAnyCallEdgeTo(CallGraphNode *Callee); - /// Removes one edge associated with a null callsite from this node to /// the specified callee function. void removeOneAbstractEdgeTo(CallGraphNode *Callee); diff --git a/llvm/include/llvm/Analysis/CallGraphSCCPass.h b/llvm/include/llvm/Analysis/CallGraphSCCPass.h index d0d81605436ea..e8714bae8f4d9 100644 --- a/llvm/include/llvm/Analysis/CallGraphSCCPass.h +++ b/llvm/include/llvm/Analysis/CallGraphSCCPass.h @@ -76,11 +76,6 @@ class CallGraphSCCPass : public Pass { /// the call graph. If the derived class implements this method, it should /// always explicitly call the implementation here. void getAnalysisUsage(AnalysisUsage &Info) const override; - -protected: - /// Optional passes call this function to check whether the pass should be - /// skipped. This is the case when optimization bisect is over the limit. - bool skipSCC(CallGraphSCC &SCC) const; }; /// CallGraphSCC - This is a single SCC that a CallGraphSCCPass is run on. diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index 66fd3fb9b0526..79f014edb58c8 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -1232,13 +1232,6 @@ class TargetRegisterInfo : public MCRegisterInfo { return nullptr; } - /// Returns the physical register number of sub-register "Index" - /// for physical register RegNo. Return zero if the sub-register does not - /// exist. - inline MCRegister getSubReg(MCRegister Reg, unsigned Idx) const { - return static_cast(this)->getSubReg(Reg, Idx); - } - /// Some targets have non-allocatable registers that aren't technically part /// of the explicit callee saved register list, but should be handled as such /// in certain cases. diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h index d25077cae63e4..33b3d7bad4a71 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h @@ -2399,6 +2399,7 @@ class OpenMPIRBuilder { CurInfo.NonContigInfo.Strides.end()); } }; + using MapInfosOrErrorTy = Expected; /// Callback function type for functions emitting the host fallback code that /// is executed when the kernel launch fails. It takes an insertion point as @@ -2407,6 +2408,11 @@ class OpenMPIRBuilder { using EmitFallbackCallbackTy = function_ref; + // Callback function type for emitting and fetching user defined custom + // mappers. + using CustomMapperCallbackTy = + function_ref(unsigned int)>; + /// Generate a target region entry call and host fallback call. /// /// \param Loc The location at which the request originated and is fulfilled. @@ -2473,11 +2479,11 @@ class OpenMPIRBuilder { /// return nullptr by reference. Accepts a reference to a MapInfosTy object /// that contains information generated for mappable clauses, /// including base pointers, pointers, sizes, map types, user-defined mappers. 
- void emitOffloadingArrays( + Error emitOffloadingArrays( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, - TargetDataInfo &Info, bool IsNonContiguous = false, - function_ref DeviceAddrCB = nullptr, - function_ref CustomMapperCB = nullptr); + TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, + bool IsNonContiguous = false, + function_ref DeviceAddrCB = nullptr); /// Allocates memory for and populates the arrays required for offloading /// (offload_{baseptrs|ptrs|mappers|sizes|maptypes|mapnames}). Then, it @@ -2485,12 +2491,12 @@ class OpenMPIRBuilder { /// library. In essence, this function is a combination of /// emitOffloadingArrays and emitOffloadingArraysArgument and should arguably /// be preferred by clients of OpenMPIRBuilder. - void emitOffloadingArraysAndArgs( + Error emitOffloadingArraysAndArgs( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, - bool IsNonContiguous = false, bool ForEndCall = false, - function_ref DeviceAddrCB = nullptr, - function_ref CustomMapperCB = nullptr); + CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous = false, + bool ForEndCall = false, + function_ref DeviceAddrCB = nullptr); /// Creates offloading entry for the provided entry ID \a ID, address \a /// Addr, size \a Size, and flags \a Flags. @@ -2950,12 +2956,12 @@ class OpenMPIRBuilder { /// \param FuncName Optional param to specify mapper function name. /// \param CustomMapperCB Optional callback to generate code related to /// custom mappers. - Function *emitUserDefinedMapper( - function_ref + Expected emitUserDefinedMapper( + function_ref PrivAndGenMapInfoCB, llvm::Type *ElemTy, StringRef FuncName, - function_ref CustomMapperCB = nullptr); + CustomMapperCallbackTy CustomMapperCB); /// Generator for '#omp target data' /// @@ -2969,21 +2975,21 @@ class OpenMPIRBuilder { /// \param IfCond Value which corresponds to the if clause condition. /// \param Info Stores all information realted to the Target Data directive. /// \param GenMapInfoCB Callback that populates the MapInfos and returns. + /// \param CustomMapperCB Callback to generate code related to + /// custom mappers. /// \param BodyGenCB Optional Callback to generate the region code. /// \param DeviceAddrCB Optional callback to generate code related to /// use_device_ptr and use_device_addr. - /// \param CustomMapperCB Optional callback to generate code related to - /// custom mappers. 
InsertPointOrErrorTy createTargetData( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, + CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc = nullptr, function_ref BodyGenCB = nullptr, function_ref DeviceAddrCB = nullptr, - function_ref CustomMapperCB = nullptr, Value *SrcLocInfo = nullptr); using TargetBodyGenCallbackTy = function_ref &Inputs, GenMapInfoCallbackTy GenMapInfoCB, TargetBodyGenCallbackTy BodyGenCB, TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, + CustomMapperCallbackTy CustomMapperCB, SmallVector Dependencies = {}, bool HasNowait = false); /// Returns __kmpc_for_static_init_* runtime function for the specified diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index d5d185ebc12f6..9558f2b9b74e0 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1284,11 +1284,24 @@ defset list AMDGPUImageDimAtomicIntrinsics = { // Data type for buffer resources (V#). Maybe, in the future, we can create a // similar one for textures (T#). def AMDGPUBufferRsrcTy : LLVMQualPointerType<8>; +// Data type for buffer fat pointers, which are a buffer resource (V#) followed by +// a 32-bit offset. These don't exist in hardware and are a compiler-internal +// convenience. +def AMDGPUBufferFatPointerTy : LLVMQualPointerType<7>; let TargetPrefix = "amdgcn" in { +// Create a buffer resource wrapping `base` with the specified `stride`, +// `numrecords`, and `flags`. All of these values will need to be +// wave-uniform when the buffer instructions are invoked, so non-uniform +// inputs to this intrinsic will trigger waterfall loops. +// +// In addition to creating ptr addrspace(8), the representation of buffer +// resources, it can create the fat pointers ptr addrspace(7) and ptr addrspace(9), +// which carry additional offset bits. When this intrinsic is used to create +// these fat pointers, their offset and index fields (if applicable) are zero. def int_amdgcn_make_buffer_rsrc : DefaultAttrsIntrinsic < - [AMDGPUBufferRsrcTy], + [llvm_anyptr_ty], [llvm_anyptr_ty, // base llvm_i16_ty, // stride (and swizzle control) llvm_i32_ty, // NumRecords / extent diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index 66eda34ea2edb..ecc6c2049ef8d 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -120,6 +120,10 @@ class Region { /// Set \p I as the \p Idx'th element in the auxiliary vector. /// NOTE: This is for internal use; it does not set the metadata. void setAux(unsigned Idx, Instruction *I); + /// Helper for dropping Aux metadata for \p I. + void dropAuxMetadata(Instruction *I); + /// Remove instruction \p I from Aux and drop metadata.
+ void removeFromAux(Instruction *I); public: Region(Context &Ctx, TargetTransformInfo &TTI); diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index c237e1ddd6b38..b13a94cd56f2e 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -65,7 +65,7 @@ CPUModel getCPUModel(StringRef CPU); } // namespace RISCV -namespace RISCVII { +namespace RISCVVType { enum VLMUL : uint8_t { LMUL_1 = 0, LMUL_2, @@ -82,9 +82,7 @@ enum { TAIL_AGNOSTIC = 1, MASK_AGNOSTIC = 2, }; -} // namespace RISCVII -namespace RISCVVType { // Is this a SEW value that can be encoded into the VTYPE format. inline static bool isValidSEW(unsigned SEW) { return isPowerOf2_32(SEW) && SEW >= 8 && SEW <= 64; @@ -95,21 +93,21 @@ inline static bool isValidLMUL(unsigned LMUL, bool Fractional) { return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1); } -unsigned encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW, bool TailAgnostic, +unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic, bool MaskAgnostic); -inline static RISCVII::VLMUL getVLMUL(unsigned VType) { - unsigned VLMUL = VType & 0x7; - return static_cast(VLMUL); +inline static VLMUL getVLMUL(unsigned VType) { + unsigned VLMul = VType & 0x7; + return static_cast(VLMul); } // Decode VLMUL into 1,2,4,8 and fractional indicator. -std::pair decodeVLMUL(RISCVII::VLMUL VLMUL); +std::pair decodeVLMUL(VLMUL VLMul); -inline static RISCVII::VLMUL encodeLMUL(unsigned LMUL, bool Fractional) { +inline static VLMUL encodeLMUL(unsigned LMUL, bool Fractional) { assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL"); unsigned LmulLog2 = Log2_32(LMUL); - return static_cast(Fractional ? 8 - LmulLog2 : LmulLog2); + return static_cast(Fractional ? 8 - LmulLog2 : LmulLog2); } inline static unsigned decodeVSEW(unsigned VSEW) { @@ -133,10 +131,9 @@ inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; } void printVType(unsigned VType, raw_ostream &OS); -unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul); +unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul); -std::optional -getSameRatioLMUL(unsigned SEW, RISCVII::VLMUL VLMUL, unsigned EEW); +std::optional getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, unsigned EEW); } // namespace RISCVVType } // namespace llvm diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 147a86de4e34e..daf6499213d48 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -16,14 +16,23 @@ #include "llvm/ADT/StringRef.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" -#include "llvm/SandboxIR/PassManager.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { -class BottomUpVec final : public FunctionPass { +/// This is a simple bottom-up vectorizer Region pass. +/// It expects a "seed slice" as an input in the Region's Aux vector. +/// The "seed slice" is a vector of instructions that can be used as a starting +/// point for vectorization, like stores to consecutive memory addresses. 
+/// Starting from the seed instructions, it walks up the def-use chain looking +/// for more instructions that can be vectorized. This pass will generate vector +/// code if it can legally vectorize the code, regardless of whether it is +/// profitable or not. For now, profitability is checked at the end of the region +/// pass pipeline by a dedicated pass that accepts or rejects the IR +/// transaction, depending on the cost. +class BottomUpVec final : public RegionPass { bool Change = false; std::unique_ptr Legality; /// The original instructions that are potentially dead after vectorization. @@ -55,16 +64,9 @@ class BottomUpVec final : public FunctionPass { /// Entry point for vectorization starting from \p Seeds. bool tryVectorize(ArrayRef Seeds); - /// The PM containing the pipeline of region passes. - RegionPassManager RPM; - public: - BottomUpVec(StringRef Pipeline); - bool runOnFunction(Function &F, const Analyses &A) final; - void printPipeline(raw_ostream &OS) const final { - OS << getName() << "\n"; - RPM.printPipeline(OS); - } + BottomUpVec() : RegionPass("bottom-up-vec") {} + bool runOnRegion(Region &Rgn, const Analyses &A) final; }; } // namespace llvm::sandboxir diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.h new file mode 100644 index 0000000000000..286d971ff4851 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.h @@ -0,0 +1,40 @@ +//===- SeedCollection.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// The seed-collection pass of the bottom-up vectorizer. +// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_SEEDCOLLECTION_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_SEEDCOLLECTION_H + +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/PassManager.h" + +namespace llvm::sandboxir { + +/// This pass collects the instructions that can become vectorization "seeds", +/// like stores to consecutive memory addresses. It then goes over the collected +/// seeds, slices them into appropriately sized chunks, creates a Region with +/// each seed slice as the Auxiliary vector, and runs the region pass pipeline. +class SeedCollection final : public FunctionPass { + + /// The PM containing the pipeline of region passes.
+ RegionPassManager RPM; + +public: + SeedCollection(StringRef Pipeline); + bool runOnFunction(Function &F, const Analyses &A) final; + void printPipeline(raw_ostream &OS) const final { + OS << getName() << "\n"; + RPM.printPipeline(OS); + } +}; + +} // namespace llvm::sandboxir + +#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_SEEDCOLLECTION_H diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp index ed9cd84bb8e9b..01344ee154fa5 100644 --- a/llvm/lib/Analysis/CallGraph.cpp +++ b/llvm/lib/Analysis/CallGraph.cpp @@ -138,16 +138,6 @@ void CallGraph::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); } #endif -void CallGraph::ReplaceExternalCallEdge(CallGraphNode *Old, - CallGraphNode *New) { - for (auto &CR : ExternalCallingNode->CalledFunctions) - if (CR.second == Old) { - CR.second->DropRef(); - CR.second = New; - CR.second->AddRef(); - } -} - // removeFunctionFromModule - Unlink the function from this module, returning // it. Because this removes the function from the module, the call graph node // is destroyed. This is only valid if the function does not call any other @@ -203,39 +193,6 @@ void CallGraphNode::print(raw_ostream &OS) const { LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); } #endif -/// removeCallEdgeFor - This method removes the edge in the node for the -/// specified call site. Note that this method takes linear time, so it -/// should be used sparingly. -void CallGraphNode::removeCallEdgeFor(CallBase &Call) { - for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) { - assert(I != CalledFunctions.end() && "Cannot find callsite to remove!"); - if (I->first && *I->first == &Call) { - I->second->DropRef(); - *I = CalledFunctions.back(); - CalledFunctions.pop_back(); - - // Remove all references to callback functions if there are any. - forEachCallbackFunction(Call, [=](Function *CB) { - removeOneAbstractEdgeTo(CG->getOrInsertFunction(CB)); - }); - return; - } - } -} - -// removeAnyCallEdgeTo - This method removes any call edges from this node to -// the specified callee function. This takes more time to execute than -// removeCallEdgeTo, so it should not be used unless necessary. -void CallGraphNode::removeAnyCallEdgeTo(CallGraphNode *Callee) { - for (unsigned i = 0, e = CalledFunctions.size(); i != e; ++i) - if (CalledFunctions[i].second == Callee) { - Callee->DropRef(); - CalledFunctions[i] = CalledFunctions.back(); - CalledFunctions.pop_back(); - --i; --e; - } -} - /// removeOneAbstractEdgeTo - Remove one edge associated with a null callsite /// from this node to the specified callee function. 
void CallGraphNode::removeOneAbstractEdgeTo(CallGraphNode *Callee) { diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp index 7caf814cdb2d7..441f0c5d2f34b 100644 --- a/llvm/lib/Analysis/CallGraphSCCPass.cpp +++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp @@ -725,28 +725,6 @@ Pass *CallGraphSCCPass::createPrinterPass(raw_ostream &OS, return new PrintCallGraphPass(Banner, OS); } -static std::string getDescription(const CallGraphSCC &SCC) { - std::string Desc = "SCC ("; - ListSeparator LS; - for (CallGraphNode *CGN : SCC) { - Desc += LS; - Function *F = CGN->getFunction(); - if (F) - Desc += F->getName(); - else - Desc += "<>"; - } - Desc += ")"; - return Desc; -} - -bool CallGraphSCCPass::skipSCC(CallGraphSCC &SCC) const { - OptPassGate &Gate = - SCC.getCallGraph().getModule().getContext().getOptPassGate(); - return Gate.isEnabled() && - !Gate.shouldRunPass(this->getPassName(), getDescription(SCC)); -} - char DummyCGSCCPass::ID = 0; INITIALIZE_PASS(DummyCGSCCPass, "DummyCGSCCPass", "DummyCGSCCPass", false, diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 8fa150f7d690e..c6b927c8eee2f 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -795,8 +795,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { // the given instruction was assessed. if (!PrintInstructionComments) return; - InstructionCostDetailMap[I].CostBefore = Cost; - InstructionCostDetailMap[I].ThresholdBefore = Threshold; + auto &CostDetail = InstructionCostDetailMap[I]; + CostDetail.CostBefore = Cost; + CostDetail.ThresholdBefore = Threshold; } void onInstructionAnalysisFinish(const Instruction *I) override { @@ -804,8 +805,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer { // the instruction has been assessed. if (!PrintInstructionComments) return; - InstructionCostDetailMap[I].CostAfter = Cost; - InstructionCostDetailMap[I].ThresholdAfter = Threshold; + auto &CostDetail = InstructionCostDetailMap[I]; + CostDetail.CostAfter = Cost; + CostDetail.ThresholdAfter = Threshold; } bool isCostBenefitAnalysisEnabled() { diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 91a5f194db9dc..e3e026f7979da 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -1984,7 +1984,7 @@ static void computeKnownBitsFromOperator(const Operator *I, const ConstantRange Range = getVScaleRange(II->getFunction(), BitWidth); uint64_t SEW = RISCVVType::decodeVSEW( cast(II->getArgOperand(HasAVL))->getZExtValue()); - RISCVII::VLMUL VLMUL = static_cast( + RISCVVType::VLMUL VLMUL = static_cast( cast(II->getArgOperand(1 + HasAVL))->getZExtValue()); uint64_t MaxVLEN = Range.getUnsignedMax().getZExtValue() * RISCV::RVVBitsPerBlock; diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp index c9efec37b0bc6..d87649c4e6567 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp @@ -362,8 +362,9 @@ static void clobberRegEntries(InlinedEntity Var, unsigned RegNo, FellowRegisters.push_back(Reg); // Drop all entries that have ended. + auto &Entries = LiveEntries[Var]; for (auto Index : IndicesToErase) - LiveEntries[Var].erase(Index); + Entries.erase(Index); } /// Add a new debug value for \p Var. Closes all overlapping debug values. 
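Several hunks in this patch (InlineCost.cpp and DbgEntityHistoryCalculator.cpp above, AMDGPUWaitSGPRHazards.cpp and HexagonVectorCombine.cpp below) apply the same micro-optimization: hoisting a repeated map subscript into a single cached reference so the container is searched once instead of twice. A minimal self-contained sketch of the idea, using illustrative types rather than the LLVM ones:

#include <map>
#include <string>

struct InstructionCostDetail {
  int CostBefore = 0;
  int ThresholdBefore = 0;
};

std::map<std::string, InstructionCostDetail> InstructionCostDetailMap;

// Before: two lookups of the same key.
void recordTwice(const std::string &Key, int Cost, int Threshold) {
  InstructionCostDetailMap[Key].CostBefore = Cost;
  InstructionCostDetailMap[Key].ThresholdBefore = Threshold;
}

// After: one lookup; the reference stays valid for both writes.
void recordOnce(const std::string &Key, int Cost, int Threshold) {
  auto &CostDetail = InstructionCostDetailMap[Key];
  CostDetail.CostBefore = Cost;
  CostDetail.ThresholdBefore = Threshold;
}

With llvm::DenseMap the cached reference would be invalidated by any intervening insertion that regrows the table, so the rewrite is only safe where nothing is inserted between the two writes, which appears to hold in the hunks above.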
diff --git a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp index 7f0a45941cf9b..9ac8c5ef66de6 100644 --- a/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/ELFNixPlatform.cpp @@ -469,11 +469,12 @@ void ELFNixPlatform::pushInitializersLoop( Worklist.pop_back(); // If we've already visited this JITDylib on this iteration then continue. - if (JDDepMap.count(DepJD)) + auto [It, Inserted] = JDDepMap.try_emplace(DepJD); + if (!Inserted) continue; // Add dep info. - auto &DM = JDDepMap[DepJD]; + auto &DM = It->second; DepJD->withLinkOrderDo([&](const JITDylibSearchOrder &O) { for (auto &KV : O) { if (KV.first == DepJD) diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp index 7ba23b0bd377e..18bc82fc827f7 100644 --- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp +++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp @@ -6555,12 +6555,11 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( const LocationDescription &Loc, InsertPointTy AllocaIP, InsertPointTy CodeGenIP, Value *DeviceID, Value *IfCond, TargetDataInfo &Info, GenMapInfoCallbackTy GenMapInfoCB, - omp::RuntimeFunction *MapperFunc, + CustomMapperCallbackTy CustomMapperCB, omp::RuntimeFunction *MapperFunc, function_ref BodyGenCB, - function_ref DeviceAddrCB, - function_ref CustomMapperCB, Value *SrcLocInfo) { + function_ref DeviceAddrCB, Value *SrcLocInfo) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -6585,9 +6584,10 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTargetData( auto BeginThenGen = [&](InsertPointTy AllocaIP, InsertPointTy CodeGenIP) -> Error { MapInfo = &GenMapInfoCB(Builder.saveIP()); - emitOffloadingArrays(AllocaIP, Builder.saveIP(), *MapInfo, Info, - /*IsNonContiguous=*/true, DeviceAddrCB, - CustomMapperCB); + if (Error Err = emitOffloadingArrays( + AllocaIP, Builder.saveIP(), *MapInfo, Info, CustomMapperCB, + /*IsNonContiguous=*/true, DeviceAddrCB)) + return Err; TargetDataRTArgs RTArgs; emitOffloadingArraysArgument(Builder, RTArgs, Info); @@ -7486,26 +7486,31 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::emitTargetTask( return Builder.saveIP(); } -void OpenMPIRBuilder::emitOffloadingArraysAndArgs( +Error OpenMPIRBuilder::emitOffloadingArraysAndArgs( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, TargetDataInfo &Info, - TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, bool IsNonContiguous, - bool ForEndCall, function_ref DeviceAddrCB, - function_ref CustomMapperCB) { - emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, IsNonContiguous, - DeviceAddrCB, CustomMapperCB); + TargetDataRTArgs &RTArgs, MapInfosTy &CombinedInfo, + CustomMapperCallbackTy CustomMapperCB, bool IsNonContiguous, + bool ForEndCall, function_ref DeviceAddrCB) { + if (Error Err = + emitOffloadingArrays(AllocaIP, CodeGenIP, CombinedInfo, Info, + CustomMapperCB, IsNonContiguous, DeviceAddrCB)) + return Err; emitOffloadingArraysArgument(Builder, RTArgs, Info, ForEndCall); + return Error::success(); } static void emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointTy AllocaIP, + OpenMPIRBuilder::TargetDataInfo &Info, const OpenMPIRBuilder::TargetKernelDefaultAttrs &DefaultAttrs, const OpenMPIRBuilder::TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, Function *OutlinedFn, Constant *OutlinedFnID, SmallVectorImpl &Args, OpenMPIRBuilder::GenMapInfoCallbackTy GenMapInfoCB, - SmallVector Dependencies = {}, - bool 
HasNoWait = false) { + OpenMPIRBuilder::CustomMapperCallbackTy CustomMapperCB, + SmallVector Dependencies, + bool HasNoWait) { // Generate a function call to the host fallback implementation of the target // region. This is called by the host when no offload entry was generated for // the target region and when the offloading call fails at runtime. @@ -7576,16 +7581,13 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, auto &&EmitTargetCallThen = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> Error { - OpenMPIRBuilder::TargetDataInfo Info( - /*RequiresDevicePointerInfo=*/false, - /*SeparateBeginEndCalls=*/true); - OpenMPIRBuilder::MapInfosTy &MapInfo = GenMapInfoCB(Builder.saveIP()); OpenMPIRBuilder::TargetDataRTArgs RTArgs; - OMPBuilder.emitOffloadingArraysAndArgs(AllocaIP, Builder.saveIP(), Info, - RTArgs, MapInfo, - /*IsNonContiguous=*/true, - /*ForEndCall=*/false); + if (Error Err = OMPBuilder.emitOffloadingArraysAndArgs( + AllocaIP, Builder.saveIP(), Info, RTArgs, MapInfo, CustomMapperCB, + /*IsNonContiguous=*/true, + /*ForEndCall=*/false)) + return Err; SmallVector NumTeamsC; for (auto [DefaultVal, RuntimeVal] : @@ -7687,13 +7689,15 @@ emitTargetCall(OpenMPIRBuilder &OMPBuilder, IRBuilderBase &Builder, OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( const LocationDescription &Loc, bool IsOffloadEntry, InsertPointTy AllocaIP, - InsertPointTy CodeGenIP, TargetRegionEntryInfo &EntryInfo, + InsertPointTy CodeGenIP, TargetDataInfo &Info, + TargetRegionEntryInfo &EntryInfo, const TargetKernelDefaultAttrs &DefaultAttrs, const TargetKernelRuntimeAttrs &RuntimeAttrs, Value *IfCond, - SmallVectorImpl &Args, GenMapInfoCallbackTy GenMapInfoCB, + SmallVectorImpl &Inputs, GenMapInfoCallbackTy GenMapInfoCB, OpenMPIRBuilder::TargetBodyGenCallbackTy CBFunc, OpenMPIRBuilder::TargetGenArgAccessorsCallbackTy ArgAccessorFuncCB, - SmallVector Dependencies, bool HasNowait) { + CustomMapperCallbackTy CustomMapperCB, SmallVector Dependencies, + bool HasNowait) { if (!updateToLocation(Loc)) return InsertPointTy(); @@ -7707,16 +7711,16 @@ OpenMPIRBuilder::InsertPointOrErrorTy OpenMPIRBuilder::createTarget( // and ArgAccessorFuncCB if (Error Err = emitTargetOutlinedFunction( *this, Builder, IsOffloadEntry, EntryInfo, DefaultAttrs, OutlinedFn, - OutlinedFnID, Args, CBFunc, ArgAccessorFuncCB)) + OutlinedFnID, Inputs, CBFunc, ArgAccessorFuncCB)) return Err; // If we are not on the target device, then we need to generate code // to make a remote call (offload) to the previously outlined function // that represents the target region. Do that now. 
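// (When compiling for the device itself, the outlined function above is the
// kernel entry point, so presumably no host-side launch sequence needs to be
// emitted; hence the isTargetDevice() guard below.)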
if (!Config.isTargetDevice()) - emitTargetCall(*this, Builder, AllocaIP, DefaultAttrs, RuntimeAttrs, IfCond, - OutlinedFn, OutlinedFnID, Args, GenMapInfoCB, Dependencies, - HasNowait); + emitTargetCall(*this, Builder, AllocaIP, Info, DefaultAttrs, RuntimeAttrs, + IfCond, OutlinedFn, OutlinedFnID, Inputs, GenMapInfoCB, + CustomMapperCB, Dependencies, HasNowait); return Builder.saveIP(); } @@ -8041,12 +8045,11 @@ void OpenMPIRBuilder::emitUDMapperArrayInitOrDel( OffloadingArgs); } -Function *OpenMPIRBuilder::emitUserDefinedMapper( - function_ref +Expected OpenMPIRBuilder::emitUserDefinedMapper( + function_ref GenMapInfoCB, - Type *ElemTy, StringRef FuncName, - function_ref CustomMapperCB) { + Type *ElemTy, StringRef FuncName, CustomMapperCallbackTy CustomMapperCB) { SmallVector Params; Params.emplace_back(Builder.getPtrTy()); Params.emplace_back(Builder.getPtrTy()); @@ -8117,7 +8120,9 @@ Function *OpenMPIRBuilder::emitUserDefinedMapper( PtrPHI->addIncoming(PtrBegin, HeadBB); // Get map clause information. Fill up the arrays with all mapped variables. - MapInfosTy &Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn); + MapInfosOrErrorTy Info = GenMapInfoCB(Builder.saveIP(), PtrPHI, BeginIn); + if (!Info) + return Info.takeError(); // Call the runtime API __tgt_mapper_num_components to get the number of // pre-existing components. @@ -8129,20 +8134,20 @@ Function *OpenMPIRBuilder::emitUserDefinedMapper( Builder.CreateShl(PreviousSize, Builder.getInt64(getFlagMemberOffset())); // Fill up the runtime mapper handle for all components. - for (unsigned I = 0; I < Info.BasePointers.size(); ++I) { + for (unsigned I = 0; I < Info->BasePointers.size(); ++I) { Value *CurBaseArg = - Builder.CreateBitCast(Info.BasePointers[I], Builder.getPtrTy()); + Builder.CreateBitCast(Info->BasePointers[I], Builder.getPtrTy()); Value *CurBeginArg = - Builder.CreateBitCast(Info.Pointers[I], Builder.getPtrTy()); - Value *CurSizeArg = Info.Sizes[I]; - Value *CurNameArg = Info.Names.size() - ? Info.Names[I] + Builder.CreateBitCast(Info->Pointers[I], Builder.getPtrTy()); + Value *CurSizeArg = Info->Sizes[I]; + Value *CurNameArg = Info->Names.size() + ? Info->Names[I] : Constant::getNullValue(Builder.getPtrTy()); // Extract the MEMBER_OF field from the map type. Value *OriMapType = Builder.getInt64( static_cast>( - Info.Types[I])); + Info->Types[I])); Value *MemberMapType = Builder.CreateNUWAdd(OriMapType, ShiftedPreviousSize); @@ -8224,10 +8229,13 @@ Function *OpenMPIRBuilder::emitUserDefinedMapper( Value *OffloadingArgs[] = {MapperHandle, CurBaseArg, CurBeginArg, CurSizeArg, CurMapType, CurNameArg}; - Function *ChildMapperFn = nullptr; - if (CustomMapperCB && CustomMapperCB(I, &ChildMapperFn)) { + + auto ChildMapperFn = CustomMapperCB(I); + if (!ChildMapperFn) + return ChildMapperFn.takeError(); + if (*ChildMapperFn) { // Call the corresponding mapper function. - Builder.CreateCall(ChildMapperFn, OffloadingArgs)->setDoesNotThrow(); + Builder.CreateCall(*ChildMapperFn, OffloadingArgs)->setDoesNotThrow(); } else { // Call the runtime API __tgt_push_mapper_component to fill up the runtime // data structure. 
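// Aside: the recurring idiom in these OMPIRBuilder hunks is a fallible
// custom-mapper callback returning llvm::Expected. A self-contained sketch of
// the propagation pattern (the callback signature matches what the hunks
// suggest for CustomMapperCallbackTy; useChildMapper is a made-up helper):

#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Error.h"

using MapperCB = llvm::function_ref<llvm::Expected<llvm::Function *>(unsigned)>;

llvm::Error useChildMapper(unsigned I, MapperCB CustomMapperCB) {
  llvm::Expected<llvm::Function *> ChildMapperFn = CustomMapperCB(I);
  if (!ChildMapperFn)
    return ChildMapperFn.takeError(); // propagate the failure unconsumed
  if (*ChildMapperFn) {
    // Non-null: a user-defined mapper exists for component I; call it.
  } else {
    // Null success: fall back to the default runtime mapper.
  }
  return llvm::Error::success();
}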
@@ -8261,18 +8269,18 @@ Function *OpenMPIRBuilder::emitUserDefinedMapper( return MapperFn; } -void OpenMPIRBuilder::emitOffloadingArrays( +Error OpenMPIRBuilder::emitOffloadingArrays( InsertPointTy AllocaIP, InsertPointTy CodeGenIP, MapInfosTy &CombinedInfo, - TargetDataInfo &Info, bool IsNonContiguous, - function_ref DeviceAddrCB, - function_ref CustomMapperCB) { + TargetDataInfo &Info, CustomMapperCallbackTy CustomMapperCB, + bool IsNonContiguous, + function_ref DeviceAddrCB) { // Reset the array information. Info.clearArrayInfo(); Info.NumberOfPtrs = CombinedInfo.BasePointers.size(); if (Info.NumberOfPtrs == 0) - return; + return Error::success(); Builder.restoreIP(AllocaIP); // Detect if we have any capture size requiring runtime evaluation of the @@ -8436,9 +8444,13 @@ void OpenMPIRBuilder::emitOffloadingArrays( // Fill up the mapper array. unsigned IndexSize = M.getDataLayout().getIndexSizeInBits(0); Value *MFunc = ConstantPointerNull::get(PtrTy); - if (CustomMapperCB) - if (Value *CustomMFunc = CustomMapperCB(I)) - MFunc = Builder.CreatePointerCast(CustomMFunc, PtrTy); + + auto CustomMFunc = CustomMapperCB(I); + if (!CustomMFunc) + return CustomMFunc.takeError(); + if (*CustomMFunc) + MFunc = Builder.CreatePointerCast(*CustomMFunc, PtrTy); + Value *MAddr = Builder.CreateInBoundsGEP( MappersArray->getAllocatedType(), MappersArray, {Builder.getIntN(IndexSize, 0), Builder.getIntN(IndexSize, I)}); @@ -8448,8 +8460,9 @@ void OpenMPIRBuilder::emitOffloadingArrays( if (!IsNonContiguous || CombinedInfo.NonContigInfo.Offsets.empty() || Info.NumberOfPtrs == 0) - return; + return Error::success(); emitNonContiguousDescriptor(AllocaIP, CodeGenIP, CombinedInfo, Info); + return Error::success(); } void OpenMPIRBuilder::emitBranch(BasicBlock *Target) { diff --git a/llvm/lib/SandboxIR/Region.cpp b/llvm/lib/SandboxIR/Region.cpp index ede738c35f032..086993e6dc872 100644 --- a/llvm/lib/SandboxIR/Region.cpp +++ b/llvm/lib/SandboxIR/Region.cpp @@ -40,8 +40,10 @@ Region::Region(Context &Ctx, TargetTransformInfo &TTI) CreateInstCB = Ctx.registerCreateInstrCallback( [this](Instruction *NewInst) { add(NewInst); }); - EraseInstCB = Ctx.registerEraseInstrCallback( - [this](Instruction *ErasedInst) { remove(ErasedInst); }); + EraseInstCB = Ctx.registerEraseInstrCallback([this](Instruction *ErasedInst) { + remove(ErasedInst); + removeFromAux(ErasedInst); + }); } Region::~Region() { @@ -84,11 +86,22 @@ void Region::setAux(unsigned Idx, Instruction *I) { Aux[Idx] = I; } +void Region::dropAuxMetadata(Instruction *I) { + auto *LLVMI = cast(I->Val); + LLVMI->setMetadata(AuxMDKind, nullptr); +} + +void Region::removeFromAux(Instruction *I) { + auto It = find(Aux, I); + if (It == Aux.end()) + return; + dropAuxMetadata(I); + Aux.erase(It); +} + void Region::clearAux() { - for (unsigned Idx : seq(0, Aux.size())) { - auto *LLVMI = cast(Aux[Idx]->Val); - LLVMI->setMetadata(AuxMDKind, nullptr); - } + for (unsigned Idx : seq(0, Aux.size())) + dropAuxMetadata(Aux[Idx]); Aux.clear(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp index b0b6c4df8e982..86b2c4f78fc3e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp @@ -2078,6 +2078,7 @@ static bool isRemovablePointerIntrinsic(Intrinsic::ID IID) { switch (IID) { default: return false; + case Intrinsic::amdgcn_make_buffer_rsrc: case Intrinsic::ptrmask: case Intrinsic::invariant_start: case Intrinsic::invariant_end: @@ 
-2092,6 +2093,25 @@ PtrParts SplitPtrStructs::visitIntrinsicInst(IntrinsicInst &I) { switch (IID) { default: break; + case Intrinsic::amdgcn_make_buffer_rsrc: { + if (!isSplitFatPtr(I.getType())) + return {nullptr, nullptr}; + Value *Base = I.getArgOperand(0); + Value *Stride = I.getArgOperand(1); + Value *NumRecords = I.getArgOperand(2); + Value *Flags = I.getArgOperand(3); + auto *SplitType = cast(I.getType()); + Type *RsrcType = SplitType->getElementType(0); + Type *OffType = SplitType->getElementType(1); + IRB.SetInsertPoint(&I); + Value *Rsrc = IRB.CreateIntrinsic(IID, {RsrcType, Base->getType()}, + {Base, Stride, NumRecords, Flags}); + copyMetadata(Rsrc, &I); + Rsrc->takeName(&I); + Value *Zero = Constant::getNullValue(OffType); + SplitUsers.insert(&I); + return {Rsrc, Zero}; + } case Intrinsic::ptrmask: { Value *Ptr = I.getArgOperand(0); if (!isSplitFatPtr(Ptr->getType())) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp index e70d6aab306fe..4df55eac5d76b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUWaitSGPRHazards.cpp @@ -384,13 +384,14 @@ class AMDGPUWaitSGPRHazards { } } - bool Changed = State != BlockState[&MBB].Out; + BlockHazardState &BS = BlockState[&MBB]; + bool Changed = State != BS.Out; if (Emit) { assert(!Changed && "Hazard state should not change on emit pass"); return Emitted; } if (Changed) - BlockState[&MBB].Out = State; + BS.Out = State; return Changed; } diff --git a/llvm/lib/Target/DirectX/DXILRootSignature.cpp b/llvm/lib/Target/DirectX/DXILRootSignature.cpp index 0fecbd698bc5f..fd390cdbf9057 100644 --- a/llvm/lib/Target/DirectX/DXILRootSignature.cpp +++ b/llvm/lib/Target/DirectX/DXILRootSignature.cpp @@ -150,11 +150,17 @@ analyzeModule(Module &M) { continue; } - MDNode *RootElementListNode = - dyn_cast(RSDefNode->getOperand(1).get()); + Metadata *RootElementListOperand = RSDefNode->getOperand(1).get(); + if (RootElementListOperand == nullptr) { + reportError(Ctx, "Root Element mdnode is null."); + continue; + } + + MDNode *RootElementListNode = dyn_cast(RootElementListOperand); if (RootElementListNode == nullptr) { - reportError(Ctx, "Missing Root Element List Metadata node."); + reportError(Ctx, "Root Element is not a metadata node."); + continue; } mcdxbc::RootSignatureDesc RSD; diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp index 4e4467c76aff5..0760d712f9afd 100644 --- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp +++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp @@ -1174,8 +1174,8 @@ auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder, for (const ByteSpan::Block &B : VSpan) { ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size); for (const ByteSpan::Block &S : ASection) { - EarliestUser[S.Seg.Val] = std::min( - EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier); + auto &EU = EarliestUser[S.Seg.Val]; + EU = std::min(EU, earliestUser(B.Seg.Val->uses()), isEarlier); } } diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index d34f45fcac008..e42e738b9973f 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -84,7 +84,7 @@ void NVPTXInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - unsigned Reg = Op.getReg(); 
+ MCRegister Reg = Op.getReg(); printRegName(O, Reg); } else if (Op.isImm()) { markup(O, Markup::Immediate) << formatImm(Op.getImm()); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp index 7511e24f705c1..74404822757ed 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp @@ -492,9 +492,9 @@ void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo, void PPCInstPrinter::printcrbitm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { - unsigned CCReg = MI->getOperand(OpNo).getReg(); + MCRegister CCReg = MI->getOperand(OpNo).getReg(); unsigned RegNo; - switch (CCReg) { + switch (CCReg.id()) { default: llvm_unreachable("Unknown CR register"); case PPC::CR0: RegNo = 0; break; case PPC::CR1: RegNo = 1; break; @@ -648,7 +648,7 @@ void PPCInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); if (!ShowVSRNumsAsVR) Reg = PPC::getRegNumForOperand(MII.get(MI->getOpcode()), Reg, OpNo); diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index cef3b7c851c79..2539c6e49686a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -488,7 +488,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, MI.getOpcode() != PPC::MFOCRF && MI.getOpcode() != PPC::MFOCRF8) || MO.getReg() < PPC::CR0 || MO.getReg() > PPC::CR7); unsigned OpNo = getOpIdxForMO(MI, MO); - unsigned Reg = + MCRegister Reg = PPC::getRegNumForOperand(MCII.get(MI.getOpcode()), MO.getReg(), OpNo); return CTX.getRegisterInfo()->getEncodingValue(Reg); } diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index 61be6abaacd2f..0a0facb10e48a 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -117,8 +117,8 @@ const char *PPC::stripRegisterPrefix(const char *RegName) { /// The operand number argument will be useful when we need to extend this /// to instructions that use both Altivec and VSX numbering (for different /// operands). -unsigned PPC::getRegNumForOperand(const MCInstrDesc &Desc, unsigned Reg, - unsigned OpNo) { +MCRegister PPC::getRegNumForOperand(const MCInstrDesc &Desc, MCRegister Reg, + unsigned OpNo) { int16_t regClass = Desc.operands()[OpNo].RegClass; switch (regClass) { // We store F0-F31, VF0-VF31 in MCOperand and it should be F0-F31, diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h index 579ee5e8facb6..d6744014949ce 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h @@ -47,8 +47,8 @@ const char *stripRegisterPrefix(const char *RegName); /// The operand number argument will be useful when we need to extend this /// to instructions that use both Altivec and VSX numbering (for different /// operands). 
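// Aside: these MC-layer hunks replace raw `unsigned` register numbers with
// MCRegister. A minimal sketch of how the wrapper behaves (assuming only the
// public llvm/MC/MCRegister.h API):

#include "llvm/MC/MCRegister.h"

bool isSameValidReg(llvm::MCRegister A, llvm::MCRegister B) {
  if (!A.isValid() || !B.isValid()) // a default MCRegister means "no register"
    return false;
  // .id() recovers the raw encoding, e.g. for the switch in printcrbitm above.
  return A.id() == B.id();
}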
-unsigned getRegNumForOperand(const MCInstrDesc &Desc, unsigned Reg, - unsigned OpNo); +MCRegister getRegNumForOperand(const MCInstrDesc &Desc, MCRegister Reg, + unsigned OpNo); } // namespace PPC diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 5784fe43879fe..ae42da6ea6e42 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -984,7 +984,7 @@ void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) { // Get the offset from the GOT Base Register to the GOT LowerPPCMachineInstrToMCInst(MI, TmpInst, *this); if (Subtarget->isSecurePlt() && isPositionIndependent() ) { - unsigned PICR = TmpInst.getOperand(0).getReg(); + MCRegister PICR = TmpInst.getOperand(0).getReg(); MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol( M->getPICLevel() == PICLevel::SmallPIC ? "_GLOBAL_OFFSET_TABLE_" : ".LTOC"); diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index ba775c4a679d0..f88af657c8ad5 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -267,7 +267,7 @@ static bool hasNonRISpills(const MachineFunction &MF) { /// MustSaveLR - Return true if this function requires that we save the LR /// register onto the stack in the prolog and restore it in the epilog of the /// function. -static bool MustSaveLR(const MachineFunction &MF, unsigned LR) { +static bool MustSaveLR(const MachineFunction &MF, MCRegister LR) { const PPCFunctionInfo *MFI = MF.getInfo(); // We need a save/restore of LR if there is any def of LR (which is @@ -311,7 +311,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF, const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - unsigned LR = RegInfo->getRARegister(); + MCRegister LR = RegInfo->getRARegister(); bool DisableRedZone = MF.getFunction().hasFnAttribute(Attribute::NoRedZone); bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. @@ -1987,7 +1987,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF, // Save and clear the LR state. PPCFunctionInfo *FI = MF.getInfo(); - unsigned LR = RegInfo->getRARegister(); + MCRegister LR = RegInfo->getRARegister(); FI->setMustSaveLR(MustSaveLR(MF, LR)); SavedRegs.reset(LR); @@ -2344,8 +2344,8 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots( for (auto &CalleeSaveReg : CSI) { MCPhysReg Reg = CalleeSaveReg.getReg(); - MCPhysReg Lower = RegInfo->getSubReg(Reg, 1); - MCPhysReg Higher = RegInfo->getSubReg(Reg, 2); + MCRegister Lower = RegInfo->getSubReg(Reg, PPC::sub_32); + MCRegister Higher = RegInfo->getSubReg(Reg, PPC::sub_32_hi_phony); if ( // Check only for SuperRegs. Lower && diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index ac87d72b7595c..6d4466b7abf53 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -2311,7 +2311,7 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) { } if (getLexer().is(AsmToken::EndOfStatement) && State == VTypeState_Done) { - RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional); + RISCVVType::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional); if (Fractional) { unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 
64 : 32; unsigned MaxSEW = ELEN / Lmul; diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index 6f0645965d737..56b1639143d8b 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -1120,7 +1120,7 @@ bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI, // divide exactly. assert( RISCVVType::decodeVLMUL(RISCVTargetLowering::getLMUL(LitTyMVT)).second || - RISCVTargetLowering::getLMUL(LitTyMVT) == RISCVII::VLMUL::LMUL_1); + RISCVTargetLowering::getLMUL(LitTyMVT) == RISCVVType::LMUL_1); // If the vector type is an LMUL-group type, extract a subvector equal to the // nearest full vector register type. @@ -1143,7 +1143,7 @@ bool RISCVLegalizerInfo::legalizeExtractSubvector(MachineInstr &MI, const LLT XLenTy(STI.getXLenVT()); auto SlidedownAmt = MIB.buildVScale(XLenTy, RemIdx); auto [Mask, VL] = buildDefaultVLOps(LitTy, MIB, MRI); - uint64_t Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC; + uint64_t Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; auto Slidedown = MIB.buildInstr( RISCV::G_VSLIDEDOWN_VL, {InterLitTy}, {MIB.buildUndef(InterLitTy), Vec, SlidedownAmt, Mask, VL, Policy}); @@ -1265,10 +1265,10 @@ bool RISCVLegalizerInfo::legalizeInsertSubvector(MachineInstr &MI, // Use tail agnostic policy if we're inserting over InterLitTy's tail. ElementCount EndIndex = ElementCount::getScalable(RemIdx) + LitTy.getElementCount(); - uint64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + uint64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; if (STI.expandVScale(EndIndex) == STI.expandVScale(InterLitTy.getElementCount())) - Policy = RISCVII::TAIL_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC; Inserted = MIB.buildInstr(RISCV::G_VSLIDEUP_VL, {InsertedDst}, diff --git a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp index fb0dc482e6081..0881de90700ab 100644 --- a/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp +++ b/llvm/lib/Target/RISCV/MCA/RISCVCustomBehaviour.cpp @@ -107,32 +107,32 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) { LLVM_DEBUG(dbgs() << "RVCB: Found VSETVLI and creating instrument for it: " << Inst << "\n"); unsigned VTypeI = Inst.getOperand(2).getImm(); - RISCVII::VLMUL VLMUL = RISCVVType::getVLMUL(VTypeI); + RISCVVType::VLMUL VLMUL = RISCVVType::getVLMUL(VTypeI); StringRef LMUL; switch (VLMUL) { - case RISCVII::LMUL_1: + case RISCVVType::LMUL_1: LMUL = "M1"; break; - case RISCVII::LMUL_2: + case RISCVVType::LMUL_2: LMUL = "M2"; break; - case RISCVII::LMUL_4: + case RISCVVType::LMUL_4: LMUL = "M4"; break; - case RISCVII::LMUL_8: + case RISCVVType::LMUL_8: LMUL = "M8"; break; - case RISCVII::LMUL_F2: + case RISCVVType::LMUL_F2: LMUL = "MF2"; break; - case RISCVII::LMUL_F4: + case RISCVVType::LMUL_F4: LMUL = "MF4"; break; - case RISCVII::LMUL_F8: + case RISCVVType::LMUL_F8: LMUL = "MF8"; break; - case RISCVII::LMUL_RESERVED: + case RISCVVType::LMUL_RESERVED: llvm_unreachable("Cannot create instrument for LMUL_RESERVED"); } SmallVector Instruments; @@ -166,7 +166,7 @@ RISCVInstrumentManager::createInstruments(const MCInst &Inst) { } static std::pair -getEEWAndEMUL(unsigned Opcode, RISCVII::VLMUL LMUL, uint8_t SEW) { +getEEWAndEMUL(unsigned Opcode, RISCVVType::VLMUL LMUL, uint8_t SEW) { uint8_t EEW; switch (Opcode) { case RISCV::VLM_V: @@ -249,7 +249,7 @@ unsigned RISCVInstrumentManager::getSchedClassID( const 
RISCVVInversePseudosTable::PseudoInfo *RVV = nullptr; if (opcodeHasEEWAndEMULInfo(Opcode)) { - RISCVII::VLMUL VLMUL = static_cast(LMUL); + RISCVVType::VLMUL VLMUL = static_cast(LMUL); auto [EEW, EMUL] = getEEWAndEMUL(Opcode, VLMUL, SEW); RVV = RISCVVInversePseudosTable::getBaseInfo(Opcode, EMUL, EEW); } else { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 2f4b569041a6f..58eb48ed613df 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -65,13 +65,9 @@ enum { VLMulShift = ConstraintShift + 3, VLMulMask = 0b111 << VLMulShift, - // Force a tail agnostic policy even this instruction has a tied destination. - ForceTailAgnosticShift = VLMulShift + 3, - ForceTailAgnosticMask = 1 << ForceTailAgnosticShift, - // Is this a _TIED vector pseudo instruction. For these instructions we // shouldn't skip the tied operand when converting to MC instructions. - IsTiedPseudoShift = ForceTailAgnosticShift + 1, + IsTiedPseudoShift = VLMulShift + 3, IsTiedPseudoMask = 1 << IsTiedPseudoShift, // Does this instruction have a SEW operand. It will be the last explicit @@ -145,12 +141,8 @@ static inline unsigned getFormat(uint64_t TSFlags) { return (TSFlags & InstFormatMask) >> InstFormatShift; } /// \returns the LMUL for the instruction. -static inline VLMUL getLMul(uint64_t TSFlags) { - return static_cast((TSFlags & VLMulMask) >> VLMulShift); -} -/// \returns true if tail agnostic is enforced for the instruction. -static inline bool doesForceTailAgnostic(uint64_t TSFlags) { - return TSFlags & ForceTailAgnosticMask; +static inline RISCVVType::VLMUL getLMul(uint64_t TSFlags) { + return static_cast((TSFlags & VLMulMask) >> VLMulShift); } /// \returns true if this a _TIED pseudo. static inline bool isTiedPseudo(uint64_t TSFlags) { diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp index d5254719b3839..a4a40862a67c6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp @@ -210,7 +210,7 @@ void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo, unsigned Imm = MI->getOperand(OpNo).getImm(); // Print the raw immediate for reserved values: vlmul[2:0]=4, vsew[2:0]=0b1xx, // or non-zero in bits 8 and above. - if (RISCVVType::getVLMUL(Imm) == RISCVII::VLMUL::LMUL_RESERVED || + if (RISCVVType::getVLMUL(Imm) == RISCVVType::VLMUL::LMUL_RESERVED || RISCVVType::getSEW(Imm) > 64 || (Imm >> 8) != 0) { O << formatImm(Imm); return; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index fb2c5c62ef871..7ea4bd94c0065 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -279,7 +279,7 @@ void RISCVDAGToDAGISel::addVectorLoadStoreOperands( // none of the others do. All have passthru operands. For our pseudos, // all loads have policy operands. 
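// Aside: the TSFlags fields edited in RISCVBaseInfo.h above are packed
// shift/mask pairs, so deleting ForceTailAgnostic simply slides the next
// field down. An illustrative self-contained sketch (shift values made up):

#include <cstdint>

namespace sketch {
enum : uint64_t {
  VLMulShift = 8, // hypothetical position of the 3-bit VLMul field
  VLMulMask = 0b111ull << VLMulShift,
  IsTiedPseudoShift = VLMulShift + 3, // now packs immediately after VLMul
  IsTiedPseudoMask = 1ull << IsTiedPseudoShift,
};
inline unsigned getVLMul(uint64_t TSFlags) {
  return (TSFlags & VLMulMask) >> VLMulShift;
}
inline bool isTiedPseudo(uint64_t TSFlags) {
  return TSFlags & IsTiedPseudoMask;
}
} // namespace sketch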
if (IsLoad) { - uint64_t Policy = RISCVII::MASK_AGNOSTIC; + uint64_t Policy = RISCVVType::MASK_AGNOSTIC; if (IsMasked) Policy = Node->getConstantOperandVal(CurOp++); SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, XLenVT); @@ -294,7 +294,7 @@ void RISCVDAGToDAGISel::selectVLSEG(SDNode *Node, unsigned NF, bool IsMasked, SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); unsigned Log2SEW = Node->getConstantOperandVal(Node->getNumOperands() - 1); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); unsigned CurOp = 2; SmallVector Operands; @@ -324,7 +324,7 @@ void RISCVDAGToDAGISel::selectVLSEGFF(SDNode *Node, unsigned NF, MVT VT = Node->getSimpleValueType(0); MVT XLenVT = Subtarget->getXLenVT(); unsigned Log2SEW = Node->getConstantOperandVal(Node->getNumOperands() - 1); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); unsigned CurOp = 2; SmallVector Operands; @@ -355,7 +355,7 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); unsigned Log2SEW = Node->getConstantOperandVal(Node->getNumOperands() - 1); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); unsigned CurOp = 2; SmallVector Operands; @@ -379,7 +379,7 @@ void RISCVDAGToDAGISel::selectVLXSEG(SDNode *Node, unsigned NF, bool IsMasked, "Element count mismatch"); #endif - RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); + RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { report_fatal_error("The V extension does not support EEW=64 for index " @@ -404,7 +404,7 @@ void RISCVDAGToDAGISel::selectVSSEG(SDNode *Node, unsigned NF, bool IsMasked, SDLoc DL(Node); MVT VT = Node->getOperand(2)->getSimpleValueType(0); unsigned Log2SEW = Node->getConstantOperandVal(Node->getNumOperands() - 1); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); unsigned CurOp = 2; SmallVector Operands; @@ -430,7 +430,7 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, SDLoc DL(Node); MVT VT = Node->getOperand(2)->getSimpleValueType(0); unsigned Log2SEW = Node->getConstantOperandVal(Node->getNumOperands() - 1); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); unsigned CurOp = 2; SmallVector Operands; @@ -454,7 +454,7 @@ void RISCVDAGToDAGISel::selectVSXSEG(SDNode *Node, unsigned NF, bool IsMasked, "Element count mismatch"); #endif - RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); + RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { report_fatal_error("The V extension does not support EEW=64 for index " @@ -495,7 +495,7 @@ void RISCVDAGToDAGISel::selectVSETVLI(SDNode *Node) { unsigned SEW = RISCVVType::decodeVSEW(Node->getConstantOperandVal(Offset) & 0x7); - RISCVII::VLMUL VLMul = static_cast( + RISCVVType::VLMUL VLMul = static_cast( Node->getConstantOperandVal(Offset + 1) & 0x7); unsigned VTypeI = RISCVVType::encodeVTYPE(VLMul, SEW, /*TailAgnostic*/ true, @@ -1672,7 +1672,7 @@ void RISCVDAGToDAGISel::Select(SDNode 
*Node) { default: llvm_unreachable("Unexpected LMUL!"); #define CASE_VMSLT_OPCODES(lmulenum, suffix) \ - case RISCVII::VLMUL::lmulenum: \ + case RISCVVType::lmulenum: \ VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \ : RISCV::PseudoVMSLT_VX_##suffix; \ VMSGTOpcode = IsUnsigned ? RISCV::PseudoVMSGTU_VX_##suffix \ @@ -1692,7 +1692,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unexpected LMUL!"); #define CASE_VMNAND_VMSET_OPCODES(lmulenum, suffix) \ - case RISCVII::VLMUL::lmulenum: \ + case RISCVVType::lmulenum: \ VMNANDOpcode = RISCV::PseudoVMNAND_MM_##suffix; \ VMSetOpcode = RISCV::PseudoVMSET_M_##suffix; \ break; @@ -1768,7 +1768,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unexpected LMUL!"); #define CASE_VMSLT_OPCODES(lmulenum, suffix) \ - case RISCVII::VLMUL::lmulenum: \ + case RISCVVType::lmulenum: \ VMSLTOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix \ : RISCV::PseudoVMSLT_VX_##suffix; \ VMSLTMaskOpcode = IsUnsigned ? RISCV::PseudoVMSLTU_VX_##suffix##_MASK \ @@ -1790,7 +1790,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { default: llvm_unreachable("Unexpected LMUL!"); #define CASE_VMXOR_VMANDN_VMOR_OPCODES(lmulenum, suffix) \ - case RISCVII::VLMUL::lmulenum: \ + case RISCVVType::lmulenum: \ VMXOROpcode = RISCV::PseudoVMXOR_MM_##suffix; \ VMANDNOpcode = RISCV::PseudoVMANDN_MM_##suffix; \ VMOROpcode = RISCV::PseudoVMOR_MM_##suffix; \ @@ -1838,13 +1838,16 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { return; } + SDValue PolicyOp = + CurDAG->getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT); + if (IsCmpConstant) { SDValue Imm = selectImm(CurDAG, SDLoc(Src2), XLenVT, CVal - 1, *Subtarget); ReplaceNode(Node, CurDAG->getMachineNode( VMSGTMaskOpcode, DL, VT, - {MaskedOff, Src1, Imm, Mask, VL, SEW})); + {MaskedOff, Src1, Imm, Mask, VL, SEW, PolicyOp})); return; } @@ -1853,10 +1856,10 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { // The result is mask undisturbed. // We use the same instructions to emulate mask agnostic behavior, because // the agnostic result can be either undisturbed or all 1. - SDValue Cmp = SDValue( - CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, - {MaskedOff, Src1, Src2, Mask, VL, SEW}), - 0); + SDValue Cmp = SDValue(CurDAG->getMachineNode(VMSLTMaskOpcode, DL, VT, + {MaskedOff, Src1, Src2, Mask, + VL, SEW, PolicyOp}), + 0); // vmxor.mm vd, vd, v0 is used to update active value. 
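// (vmxor.mm vd, vd, v0 flips exactly the lanes where the mask v0 is 1, so the
// active lanes of the vmslt result become the desired vmsge, while inactive
// lanes keep whatever the mask-undisturbed compare left there.)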
ReplaceNode(Node, CurDAG->getMachineNode(VMXOROpcode, DL, VT, {Cmp, Mask, VL, MaskSEW})); @@ -2002,8 +2005,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() && "Element count mismatch"); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); - RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { report_fatal_error("The V extension does not support EEW=64 for index " @@ -2055,7 +2058,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided, Operands, /*IsLoad=*/true); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(IsMasked, IsStrided, /*FF*/ false, Log2SEW, static_cast(LMUL)); @@ -2082,7 +2085,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { /*IsStridedOrIndexed*/ false, Operands, /*IsLoad=*/true); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); const RISCV::VLEPseudo *P = RISCV::getVLEPseudo(IsMasked, /*Strided*/ false, /*FF*/ true, Log2SEW, static_cast(LMUL)); @@ -2208,8 +2211,8 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { assert(VT.getVectorElementCount() == IndexVT.getVectorElementCount() && "Element count mismatch"); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); - RISCVII::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL IndexLMUL = RISCVTargetLowering::getLMUL(IndexVT); unsigned IndexLog2EEW = Log2_32(IndexVT.getScalarSizeInBits()); if (IndexLog2EEW == 6 && !Subtarget->is64Bit()) { report_fatal_error("The V extension does not support EEW=64 for index " @@ -2247,7 +2250,7 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { addVectorLoadStoreOperands(Node, Log2SEW, DL, CurOp, IsMasked, IsStrided, Operands); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); const RISCV::VSEPseudo *P = RISCV::getVSEPseudo( IsMasked, IsStrided, Log2SEW, static_cast(LMUL)); MachineSDNode *Store = @@ -2314,11 +2317,12 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (Idx != 0) break; - RISCVII::VLMUL SubVecLMUL = RISCVTargetLowering::getLMUL(SubVecContainerVT); + RISCVVType::VLMUL SubVecLMUL = + RISCVTargetLowering::getLMUL(SubVecContainerVT); [[maybe_unused]] bool IsSubVecPartReg = - SubVecLMUL == RISCVII::VLMUL::LMUL_F2 || - SubVecLMUL == RISCVII::VLMUL::LMUL_F4 || - SubVecLMUL == RISCVII::VLMUL::LMUL_F8; + SubVecLMUL == RISCVVType::VLMUL::LMUL_F2 || + SubVecLMUL == RISCVVType::VLMUL::LMUL_F4 || + SubVecLMUL == RISCVVType::VLMUL::LMUL_F8; assert((V.getValueType().isRISCVVectorTuple() || !IsSubVecPartReg || V.isUndef()) && "Expecting lowering to have created legal INSERT_SUBVECTORs when " @@ -2439,11 +2443,11 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { Ld->getBasePtr()}; if (IsStrided) Operands.push_back(CurDAG->getRegister(RISCV::X0, XLenVT)); - uint64_t Policy = RISCVII::MASK_AGNOSTIC | RISCVII::TAIL_AGNOSTIC; + uint64_t Policy = RISCVVType::MASK_AGNOSTIC | RISCVVType::TAIL_AGNOSTIC; SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, 
XLenVT); Operands.append({VL, SEW, PolicyOp, Ld->getChain()}); - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); const RISCV::VLEPseudo *P = RISCV::getVLEPseudo( /*IsMasked*/ false, IsStrided, /*FF*/ false, Log2SEW, static_cast(LMUL)); @@ -3792,9 +3796,9 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { const MCInstrDesc &MaskedMCID = TII->get(N->getMachineOpcode()); const bool MaskedHasPassthru = RISCVII::isFirstDefTiedToFirstUse(MaskedMCID); - assert(RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) == - RISCVII::hasVecPolicyOp(MCID.TSFlags) && - "Masked and unmasked pseudos are inconsistent"); + assert((RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) || + !RISCVII::hasVecPolicyOp(MCID.TSFlags)) && + "Unmasked pseudo has policy but masked pseudo doesn't?"); assert(RISCVII::hasVecPolicyOp(MCID.TSFlags) == HasPassthru && "Unexpected pseudo structure"); assert(!(HasPassthru && !MaskedHasPassthru) && @@ -3803,11 +3807,18 @@ bool RISCVDAGToDAGISel::doPeepholeMaskedRVV(MachineSDNode *N) { SmallVector Ops; // Skip the passthru operand at index 0 if the unmasked don't have one. bool ShouldSkip = !HasPassthru && MaskedHasPassthru; + bool DropPolicy = !RISCVII::hasVecPolicyOp(MCID.TSFlags) && + RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags); + bool HasChainOp = + N->getOperand(N->getNumOperands() - 1).getValueType() == MVT::Other; + unsigned LastOpNum = N->getNumOperands() - 1 - HasChainOp; for (unsigned I = ShouldSkip, E = N->getNumOperands(); I != E; I++) { // Skip the mask SDValue Op = N->getOperand(I); if (I == MaskOpIdx) continue; + if (DropPolicy && I == LastOpNum) + continue; Ops.push_back(Op); } @@ -3975,7 +3986,7 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) { // preserve them. bool MergeVLShrunk = VL != OrigVL; uint64_t Policy = (isImplicitDef(Passthru) && !MergeVLShrunk) - ? RISCVII::TAIL_AGNOSTIC + ? RISCVVType::TAIL_AGNOSTIC : /*TUMU*/ 0; SDValue PolicyOp = CurDAG->getTargetConstant(Policy, DL, Subtarget->getXLenVT()); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index c40ab0d09bdf6..98c25bc93a8a2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1110,7 +1110,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(FloatingPointLibCallOps, VT, Expand); // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. 
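// (Promoting f16/bf16 to f32 doubles the element width, so an LMUL=8 input
// would need an LMUL=16 result type, which does not exist; splitting first
// keeps the promoted halves at the legal LMUL=8.)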
- if (getLMUL(VT) == RISCVII::VLMUL::LMUL_8) { + if (getLMUL(VT) == RISCVVType::LMUL_8) { setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); setOperationAction(ZvfhminZvfbfminPromoteVPOps, VT, Custom); } else { @@ -2361,25 +2361,25 @@ static void translateSetCCForBranch(const SDLoc &DL, SDValue &LHS, SDValue &RHS, } } -RISCVII::VLMUL RISCVTargetLowering::getLMUL(MVT VT) { +RISCVVType::VLMUL RISCVTargetLowering::getLMUL(MVT VT) { if (VT.isRISCVVectorTuple()) { if (VT.SimpleTy >= MVT::riscv_nxv1i8x2 && VT.SimpleTy <= MVT::riscv_nxv1i8x8) - return RISCVII::LMUL_F8; + return RISCVVType::LMUL_F8; if (VT.SimpleTy >= MVT::riscv_nxv2i8x2 && VT.SimpleTy <= MVT::riscv_nxv2i8x8) - return RISCVII::LMUL_F4; + return RISCVVType::LMUL_F4; if (VT.SimpleTy >= MVT::riscv_nxv4i8x2 && VT.SimpleTy <= MVT::riscv_nxv4i8x8) - return RISCVII::LMUL_F2; + return RISCVVType::LMUL_F2; if (VT.SimpleTy >= MVT::riscv_nxv8i8x2 && VT.SimpleTy <= MVT::riscv_nxv8i8x8) - return RISCVII::LMUL_1; + return RISCVVType::LMUL_1; if (VT.SimpleTy >= MVT::riscv_nxv16i8x2 && VT.SimpleTy <= MVT::riscv_nxv16i8x4) - return RISCVII::LMUL_2; + return RISCVVType::LMUL_2; if (VT.SimpleTy == MVT::riscv_nxv32i8x2) - return RISCVII::LMUL_4; + return RISCVVType::LMUL_4; llvm_unreachable("Invalid vector tuple type LMUL."); } @@ -2392,56 +2392,54 @@ RISCVII::VLMUL RISCVTargetLowering::getLMUL(MVT VT) { default: llvm_unreachable("Invalid LMUL."); case 8: - return RISCVII::VLMUL::LMUL_F8; + return RISCVVType::LMUL_F8; case 16: - return RISCVII::VLMUL::LMUL_F4; + return RISCVVType::LMUL_F4; case 32: - return RISCVII::VLMUL::LMUL_F2; + return RISCVVType::LMUL_F2; case 64: - return RISCVII::VLMUL::LMUL_1; + return RISCVVType::LMUL_1; case 128: - return RISCVII::VLMUL::LMUL_2; + return RISCVVType::LMUL_2; case 256: - return RISCVII::VLMUL::LMUL_4; + return RISCVVType::LMUL_4; case 512: - return RISCVII::VLMUL::LMUL_8; + return RISCVVType::LMUL_8; } } -unsigned RISCVTargetLowering::getRegClassIDForLMUL(RISCVII::VLMUL LMul) { +unsigned RISCVTargetLowering::getRegClassIDForLMUL(RISCVVType::VLMUL LMul) { switch (LMul) { default: llvm_unreachable("Invalid LMUL."); - case RISCVII::VLMUL::LMUL_F8: - case RISCVII::VLMUL::LMUL_F4: - case RISCVII::VLMUL::LMUL_F2: - case RISCVII::VLMUL::LMUL_1: + case RISCVVType::LMUL_F8: + case RISCVVType::LMUL_F4: + case RISCVVType::LMUL_F2: + case RISCVVType::LMUL_1: return RISCV::VRRegClassID; - case RISCVII::VLMUL::LMUL_2: + case RISCVVType::LMUL_2: return RISCV::VRM2RegClassID; - case RISCVII::VLMUL::LMUL_4: + case RISCVVType::LMUL_4: return RISCV::VRM4RegClassID; - case RISCVII::VLMUL::LMUL_8: + case RISCVVType::LMUL_8: return RISCV::VRM8RegClassID; } } unsigned RISCVTargetLowering::getSubregIndexByMVT(MVT VT, unsigned Index) { - RISCVII::VLMUL LMUL = getLMUL(VT); - if (LMUL == RISCVII::VLMUL::LMUL_F8 || - LMUL == RISCVII::VLMUL::LMUL_F4 || - LMUL == RISCVII::VLMUL::LMUL_F2 || - LMUL == RISCVII::VLMUL::LMUL_1) { + RISCVVType::VLMUL LMUL = getLMUL(VT); + if (LMUL == RISCVVType::LMUL_F8 || LMUL == RISCVVType::LMUL_F4 || + LMUL == RISCVVType::LMUL_F2 || LMUL == RISCVVType::LMUL_1) { static_assert(RISCV::sub_vrm1_7 == RISCV::sub_vrm1_0 + 7, "Unexpected subreg numbering"); return RISCV::sub_vrm1_0 + Index; } - if (LMUL == RISCVII::VLMUL::LMUL_2) { + if (LMUL == RISCVVType::LMUL_2) { static_assert(RISCV::sub_vrm2_3 == RISCV::sub_vrm2_0 + 3, "Unexpected subreg numbering"); return RISCV::sub_vrm2_0 + Index; } - if (LMUL == RISCVII::VLMUL::LMUL_4) { + if (LMUL == RISCVVType::LMUL_4) { static_assert(RISCV::sub_vrm4_1 == 
RISCV::sub_vrm4_0 + 1, "Unexpected subreg numbering"); return RISCV::sub_vrm4_0 + Index; @@ -3347,9 +3345,9 @@ static SDValue getVSlidedown(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL, EVT VT, SDValue Passthru, SDValue Op, SDValue Offset, SDValue Mask, SDValue VL, - unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) { + unsigned Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED) { if (Passthru.isUndef()) - Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT()); SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp}; return DAG.getNode(RISCVISD::VSLIDEDOWN_VL, DL, VT, Ops); @@ -3359,9 +3357,9 @@ static SDValue getVSlideup(SelectionDAG &DAG, const RISCVSubtarget &Subtarget, const SDLoc &DL, EVT VT, SDValue Passthru, SDValue Op, SDValue Offset, SDValue Mask, SDValue VL, - unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED) { + unsigned Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED) { if (Passthru.isUndef()) - Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; SDValue PolicyOp = DAG.getTargetConstant(Policy, DL, Subtarget.getXLenVT()); SDValue Ops[] = {Passthru, Op, Offset, Mask, VL, PolicyOp}; return DAG.getNode(RISCVISD::VSLIDEUP_VL, DL, VT, Ops); @@ -4245,13 +4243,13 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, InstructionCost PerSlideCost = 1; switch (RISCVTargetLowering::getLMUL(ContainerVT)) { default: break; - case RISCVII::VLMUL::LMUL_2: + case RISCVVType::LMUL_2: PerSlideCost = 2; break; - case RISCVII::VLMUL::LMUL_4: + case RISCVVType::LMUL_4: PerSlideCost = 4; break; - case RISCVII::VLMUL::LMUL_8: + case RISCVVType::LMUL_8: PerSlideCost = 8; break; } @@ -4281,7 +4279,7 @@ static SDValue lowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, VT.getVectorElementType().getSizeInBits() <= Subtarget.getFLen()) && "Illegal type which will result in reserved encoding"); - const unsigned Policy = RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC; + const unsigned Policy = RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC; SDValue Vec; UndefCount = 0; @@ -4773,11 +4771,12 @@ static SDValue lowerVECTOR_SHUFFLEAsVSlideup(const SDLoc &DL, MVT VT, auto TrueMask = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget).first; // We slide up by the index that the subvector is being inserted at, and set // VL to the index + the number of elements being inserted. - unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVII::MASK_AGNOSTIC; + unsigned Policy = + RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED | RISCVVType::MASK_AGNOSTIC; // If the we're adding a suffix to the in place vector, i.e. inserting right // up to the very end of it, then we don't actually care about the tail. 
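// (The policy operand is a 2-bit field; as the RISCVVType uses here suggest,
// TAIL_UNDISTURBED_MASK_UNDISTURBED == 0, TAIL_AGNOSTIC == 1 (bit 0) and
// MASK_AGNOSTIC == 2 (bit 1), so OR-ing in TAIL_AGNOSTIC below upgrades the
// "tu,ma" policy (2) to the fully agnostic "ta,ma" policy (3).)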
if (NumSubElts + Index >= (int)NumElts) - Policy |= RISCVII::TAIL_AGNOSTIC; + Policy |= RISCVVType::TAIL_AGNOSTIC; InPlace = convertToScalableVector(ContainerVT, InPlace, DAG, Subtarget); ToInsert = convertToScalableVector(ContainerVT, ToInsert, DAG, Subtarget); @@ -5570,7 +5569,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, if (LoV) Res = getVSlideup(DAG, Subtarget, DL, ContainerVT, Res, LoV, DAG.getConstant(InvRotate, DL, XLenVT), TrueMask, VL, - RISCVII::TAIL_AGNOSTIC); + RISCVVType::TAIL_AGNOSTIC); return convertFromScalableVector(VT, Res, DAG, Subtarget); } @@ -9457,10 +9456,10 @@ SDValue RISCVTargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op, DAG.getNode(ISD::ADD, DL, XLenVT, Idx, DAG.getConstant(1, DL, XLenVT)); // Use tail agnostic policy if Idx is the last index of Vec. - unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + unsigned Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; if (VecVT.isFixedLengthVector() && isa(Idx) && Idx->getAsZExtVal() + 1 == VecVT.getVectorNumElements()) - Policy = RISCVII::TAIL_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC; SDValue Slideup = getVSlideup(DAG, Subtarget, DL, ContainerVT, Vec, ValInVec, Idx, Mask, InsertVL, Policy); @@ -9740,7 +9739,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, } } if (!I32VL) { - RISCVII::VLMUL Lmul = RISCVTargetLowering::getLMUL(VT); + RISCVVType::VLMUL Lmul = RISCVTargetLowering::getLMUL(VT); SDValue LMUL = DAG.getConstant(Lmul, DL, XLenVT); unsigned Sew = RISCVVType::encodeSEW(VT.getScalarSizeInBits()); SDValue SEW = DAG.getConstant(Sew, DL, XLenVT); @@ -9791,7 +9790,7 @@ static SDValue lowerVectorIntrinsicScalars(SDValue Op, SelectionDAG &DAG, if (MaskedOff.isUndef()) return Vec; // TAMU - if (Policy == RISCVII::TAIL_AGNOSTIC) + if (Policy == RISCVVType::TAIL_AGNOSTIC) return DAG.getNode(RISCVISD::VMERGE_VL, DL, VT, Mask, Vec, MaskedOff, DAG.getUNDEF(VT), AVL); // TUMA or TUMU: Currently we always emit tumu policy regardless of tuma. @@ -10547,7 +10546,7 @@ static SDValue lowerReductionSeq(unsigned RVVOpcode, MVT ResVT, DAG.getNode(ISD::INSERT_SUBVECTOR, DL, M1VT, DAG.getUNDEF(M1VT), InitialValue, DAG.getVectorIdxConstant(0, DL)); SDValue PassThru = NonZeroAVL ? DAG.getUNDEF(M1VT) : InitialValue; - SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT); + SDValue Policy = DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT); SDValue Ops[] = {PassThru, Vec, InitialValue, Mask, VL, Policy}; SDValue Reduction = DAG.getNode(RVVOpcode, DL, M1VT, Ops); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResVT, Reduction, @@ -10807,9 +10806,9 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, SDValue VL = DAG.getConstant(EndIndex, DL, XLenVT); // Use tail agnostic policy if we're inserting over Vec's tail. - unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + unsigned Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; if (VecVT.isFixedLengthVector() && EndIndex == VecVT.getVectorNumElements()) - Policy = RISCVII::TAIL_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC; // If we're inserting into the lowest elements, use a tail undisturbed // vmv.v.v. @@ -10933,10 +10932,10 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, VL = DAG.getElementCount(DL, XLenVT, SubVecVT.getVectorElementCount()); // Use tail agnostic policy if we're inserting over InterSubVT's tail. 
- unsigned Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; + unsigned Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; if (Subtarget.expandVScale(EndIndex) == Subtarget.expandVScale(InterSubVT.getVectorElementCount())) - Policy = RISCVII::TAIL_AGNOSTIC; + Policy = RISCVVType::TAIL_AGNOSTIC; // If we're inserting into the lowest elements, use a tail undisturbed // vmv.v.v. @@ -11108,7 +11107,7 @@ SDValue RISCVTargetLowering::lowerEXTRACT_SUBVECTOR(SDValue Op, // was > M1 then the index would need to be a multiple of VLMAX, and so would // divide exactly. assert(RISCVVType::decodeVLMUL(getLMUL(ContainerSubVecVT)).second || - getLMUL(ContainerSubVecVT) == RISCVII::VLMUL::LMUL_1); + getLMUL(ContainerSubVecVT) == RISCVVType::LMUL_1); // If the vector type is an LMUL-group type, extract a subvector equal to the // nearest full vector register type. @@ -11719,7 +11718,7 @@ SDValue RISCVTargetLowering::lowerVECTOR_SPLICE(SDValue Op, DownOffset, TrueMask, UpOffset); return getVSlideup(DAG, Subtarget, DL, VecVT, SlideDown, V2, UpOffset, TrueMask, DAG.getRegister(RISCV::X0, XLenVT), - RISCVII::TAIL_AGNOSTIC); + RISCVVType::TAIL_AGNOSTIC); } SDValue @@ -11883,7 +11882,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, Ops.push_back(Mask); Ops.push_back(VL); if (IntID == Intrinsic::riscv_vle_mask) - Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT)); + Ops.push_back(DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)); SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); @@ -11902,7 +11901,7 @@ SDValue RISCVTargetLowering::lowerMaskedLoad(SDValue Op, // overflow. if (IndexEltVT == MVT::i8 && VT.getVectorNumElements() > 256) { // FIXME: We need to do vector splitting manually for LMUL=8 cases. - assert(getLMUL(IndexVT) != RISCVII::LMUL_8); + assert(getLMUL(IndexVT) != RISCVVType::LMUL_8); IndexVT = IndexVT.changeVectorElementType(MVT::i16); UseVRGATHEREI16 = true; } @@ -12698,7 +12697,7 @@ RISCVTargetLowering::lowerVPSpliceExperimental(SDValue Op, getVSlidedown(DAG, Subtarget, DL, ContainerVT, DAG.getUNDEF(ContainerVT), Op1, DownOffset, Mask, UpOffset); SDValue Result = getVSlideup(DAG, Subtarget, DL, ContainerVT, SlideDown, Op2, - UpOffset, Mask, EVL2, RISCVII::TAIL_AGNOSTIC); + UpOffset, Mask, EVL2, RISCVVType::TAIL_AGNOSTIC); if (IsMaskVector) { // Truncate Result back to a mask vector (Result has same EVL as Op2) @@ -12915,7 +12914,8 @@ SDValue RISCVTargetLowering::lowerVPStridedLoad(SDValue Op, } Ops.push_back(VPNode->getVectorLength()); if (!IsUnmasked) { - SDValue Policy = DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT); + SDValue Policy = + DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT); Ops.push_back(Policy); } @@ -13053,7 +13053,7 @@ SDValue RISCVTargetLowering::lowerMaskedGather(SDValue Op, Ops.push_back(Mask); Ops.push_back(VL); if (!IsUnmasked) - Ops.push_back(DAG.getTargetConstant(RISCVII::TAIL_AGNOSTIC, DL, XLenVT)); + Ops.push_back(DAG.getTargetConstant(RISCVVType::TAIL_AGNOSTIC, DL, XLenVT)); SDVTList VTs = DAG.getVTList({ContainerVT, MVT::Other}); SDValue Result = @@ -19553,8 +19553,8 @@ void RISCVTargetLowering::computeKnownBitsForTargetNode(const SDValue Op, case Intrinsic::riscv_vsetvlimax: { bool HasAVL = IntNo == Intrinsic::riscv_vsetvli; unsigned VSEW = Op.getConstantOperandVal(HasAVL + 1); - RISCVII::VLMUL VLMUL = - static_cast(Op.getConstantOperandVal(HasAVL + 2)); + RISCVVType::VLMUL VLMUL = + static_cast(Op.getConstantOperandVal(HasAVL + 2)); unsigned SEW = RISCVVType::decodeVSEW(VSEW); auto 
[LMul, Fractional] = RISCVVType::decodeVLMUL(VLMUL); uint64_t MaxVL = Subtarget.getRealMaxVLen() / SEW; @@ -20168,7 +20168,7 @@ static MachineBasicBlock *emitSelectPseudo(MachineInstr &MI, // Helper to find Masked Pseudo instruction from MC instruction, LMUL and SEW. static const RISCV::RISCVMaskedPseudoInfo * -lookupMaskedIntrinsic(uint16_t MCOpcode, RISCVII::VLMUL LMul, unsigned SEW) { +lookupMaskedIntrinsic(uint16_t MCOpcode, RISCVVType::VLMUL LMul, unsigned SEW) { const RISCVVInversePseudosTable::PseudoInfo *Inverse = RISCVVInversePseudosTable::getBaseInfo(MCOpcode, LMul, SEW); assert(Inverse && "Unexpected LMUL and SEW pair for instruction"); @@ -20211,7 +20211,7 @@ static MachineBasicBlock *emitVFROUND_NOEXCEPT_MASK(MachineInstr &MI, /*IsImp*/ true)); // Emit a VFCVT_F_X - RISCVII::VLMUL LMul = RISCVII::getLMul(MI.getDesc().TSFlags); + RISCVVType::VLMUL LMul = RISCVII::getLMul(MI.getDesc().TSFlags); unsigned Log2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); // There is no E8 variant for VFCVT_F_X. assert(Log2SEW >= 4); @@ -23262,13 +23262,13 @@ bool RISCVTargetLowering::lowerDeinterleavedIntrinsicToVPLoad( Load->getModule(), IntrMaskIds[Factor - 2], {VecTupTy, Mask->getType(), EVL->getType()}); - Value *Operands[] = { - PoisonVal, - Load->getArgOperand(0), - Mask, - EVL, - ConstantInt::get(XLenTy, RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC), - ConstantInt::get(XLenTy, Log2_64(SEW))}; + Value *Operands[] = {PoisonVal, + Load->getArgOperand(0), + Mask, + EVL, + ConstantInt::get(XLenTy, RISCVVType::TAIL_AGNOSTIC | + RISCVVType::MASK_AGNOSTIC), + ConstantInt::get(XLenTy, Log2_64(SEW))}; CallInst *VlsegN = Builder.CreateCall(VlsegNFunc, Operands); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index e9dd8ff96fa37..26b888653c81d 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -823,7 +823,7 @@ class RISCVTargetLowering : public TargetLowering { // Return the value of VLMax for the given vector type (i.e. SEW and LMUL) SDValue computeVLMax(MVT VecVT, const SDLoc &DL, SelectionDAG &DAG) const; - static RISCVII::VLMUL getLMUL(MVT VT); + static RISCVVType::VLMUL getLMUL(MVT VT); inline static unsigned computeVLMAX(unsigned VectorBits, unsigned EltSize, unsigned MinSize) { // Original equation: @@ -839,7 +839,7 @@ class RISCVTargetLowering : public TargetLowering { static std::pair computeVLMAXBounds(MVT ContainerVT, const RISCVSubtarget &Subtarget); - static unsigned getRegClassIDForLMUL(RISCVII::VLMUL LMul); + static unsigned getRegClassIDForLMUL(RISCVVType::VLMUL LMul); static unsigned getSubregIndexByMVT(MVT VT, unsigned Index); static unsigned getRegClassIDForVecVT(MVT VT); static std::pair diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp index 4a74906ed3cc3..7433603daff85 100644 --- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp +++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp @@ -342,7 +342,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const DemandedFields &DF) { } #endif -static bool isLMUL1OrSmaller(RISCVII::VLMUL LMUL) { +static bool isLMUL1OrSmaller(RISCVVType::VLMUL LMUL) { auto [LMul, Fractional] = RISCVVType::decodeVLMUL(LMUL); return Fractional || LMul == 1; } @@ -564,7 +564,7 @@ class VSETVLIInfo { } State = Uninitialized; // Fields from VTYPE. 
- RISCVII::VLMUL VLMul = RISCVII::LMUL_1; + RISCVVType::VLMUL VLMul = RISCVVType::LMUL_1; uint8_t SEW = 0; uint8_t TailAgnostic : 1; uint8_t MaskAgnostic : 1; @@ -642,7 +642,7 @@ class VSETVLIInfo { } unsigned getSEW() const { return SEW; } - RISCVII::VLMUL getVLMUL() const { return VLMul; } + RISCVVType::VLMUL getVLMUL() const { return VLMul; } bool getTailAgnostic() const { return TailAgnostic; } bool getMaskAgnostic() const { return MaskAgnostic; } @@ -707,7 +707,7 @@ class VSETVLIInfo { TailAgnostic = RISCVVType::isTailAgnostic(VType); MaskAgnostic = RISCVVType::isMaskAgnostic(VType); } - void setVTYPE(RISCVII::VLMUL L, unsigned S, bool TA, bool MA) { + void setVTYPE(RISCVVType::VLMUL L, unsigned S, bool TA, bool MA) { assert(isValid() && !isUnknown() && "Can't set VTYPE for uninitialized or unknown"); VLMul = L; @@ -716,7 +716,7 @@ class VSETVLIInfo { MaskAgnostic = MA; } - void setVLMul(RISCVII::VLMUL VLMul) { this->VLMul = VLMul; } + void setVLMul(RISCVVType::VLMUL VLMul) { this->VLMul = VLMul; } unsigned encodeVTYPE() const { assert(isValid() && !isUnknown() && !SEWLMULRatioOnly && @@ -1018,7 +1018,7 @@ RISCVInsertVSETVLI::getInfoForVSETVLI(const MachineInstr &MI) const { } static unsigned computeVLMAX(unsigned VLEN, unsigned SEW, - RISCVII::VLMUL VLMul) { + RISCVVType::VLMUL VLMul) { auto [LMul, Fractional] = RISCVVType::decodeVLMUL(VLMul); if (Fractional) VLEN = VLEN / LMul; @@ -1043,22 +1043,18 @@ RISCVInsertVSETVLI::computeInfoForInstr(const MachineInstr &MI) const { if (RISCVII::hasVecPolicyOp(TSFlags)) { const MachineOperand &Op = MI.getOperand(MI.getNumExplicitOperands() - 1); uint64_t Policy = Op.getImm(); - assert(Policy <= (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC) && + assert(Policy <= + (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC) && "Invalid Policy Value"); - TailAgnostic = Policy & RISCVII::TAIL_AGNOSTIC; - MaskAgnostic = Policy & RISCVII::MASK_AGNOSTIC; + TailAgnostic = Policy & RISCVVType::TAIL_AGNOSTIC; + MaskAgnostic = Policy & RISCVVType::MASK_AGNOSTIC; } - // Some pseudo instructions force a tail agnostic policy despite having a - // tied def. - if (RISCVII::doesForceTailAgnostic(TSFlags)) - TailAgnostic = true; - if (!RISCVII::usesMaskPolicy(TSFlags)) MaskAgnostic = true; } - RISCVII::VLMUL VLMul = RISCVII::getLMul(TSFlags); + RISCVVType::VLMUL VLMul = RISCVII::getLMul(TSFlags); unsigned Log2SEW = MI.getOperand(getSEWOpNum(MI)).getImm(); // A Log2SEW of 0 is an operation on mask registers only. @@ -1250,8 +1246,7 @@ void RISCVInsertVSETVLI::transferBefore(VSETVLIInfo &Info, // be coalesced into another vsetvli since we won't demand any fields. 
VSETVLIInfo NewInfo; // Need a new VSETVLIInfo to clear SEWLMULRatioOnly NewInfo.setAVLImm(1); - NewInfo.setVTYPE(RISCVII::VLMUL::LMUL_1, /*sew*/ 8, /*ta*/ true, - /*ma*/ true); + NewInfo.setVTYPE(RISCVVType::LMUL_1, /*sew*/ 8, /*ta*/ true, /*ma*/ true); Info = NewInfo; return; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td index cea28bdce284c..47fe51bafd17c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td +++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td @@ -193,36 +193,33 @@ class RVInstCommon VLMul = 0; let TSFlags{10-8} = VLMul; - bit ForceTailAgnostic = false; - let TSFlags{11} = ForceTailAgnostic; - bit IsTiedPseudo = 0; - let TSFlags{12} = IsTiedPseudo; + let TSFlags{11} = IsTiedPseudo; bit HasSEWOp = 0; - let TSFlags{13} = HasSEWOp; + let TSFlags{12} = HasSEWOp; bit HasVLOp = 0; - let TSFlags{14} = HasVLOp; + let TSFlags{13} = HasVLOp; bit HasVecPolicyOp = 0; - let TSFlags{15} = HasVecPolicyOp; + let TSFlags{14} = HasVecPolicyOp; bit IsRVVWideningReduction = 0; - let TSFlags{16} = IsRVVWideningReduction; + let TSFlags{15} = IsRVVWideningReduction; bit UsesMaskPolicy = 0; - let TSFlags{17} = UsesMaskPolicy; + let TSFlags{16} = UsesMaskPolicy; // Indicates that the result can be considered sign extended from bit 31. Some // instructions with this flag aren't W instructions, but are either sign // extended from a smaller size, always outputs a small integer, or put zeros // in bits 63:31. Used by the SExtWRemoval pass. bit IsSignExtendingOpW = 0; - let TSFlags{18} = IsSignExtendingOpW; + let TSFlags{17} = IsSignExtendingOpW; bit HasRoundModeOp = 0; - let TSFlags{19} = HasRoundModeOp; + let TSFlags{18} = HasRoundModeOp; // This is only valid when HasRoundModeOp is set to 1. HasRoundModeOp is set // to 1 for vector fixed-point or floating-point intrinsics. This bit is @@ -230,7 +227,7 @@ class RVInstCommon narrowing case // 3 -> widening case bits<2> TargetOverlapConstraintType = 0; - let TSFlags{22-21} = TargetOverlapConstraintType; + let TSFlags{21-20} = TargetOverlapConstraintType; // Most vector instructions are elementwise, but some may depend on the value // of VL (e.g. vslide1down.vx), and others may depend on the VL and mask // (e.g. vredsum.vs, viota.m). Mark these instructions so that peepholes avoid // changing their VL and/or mask. EltDeps ElementsDependOn = EltDepsNone; - let TSFlags{23} = ElementsDependOn.VL; - let TSFlags{24} = ElementsDependOn.Mask; + let TSFlags{22} = ElementsDependOn.VL; + let TSFlags{23} = ElementsDependOn.Mask; // Indicates the EEW of a vector instruction's destination operand. EEW DestEEW = EEWSEWx1; - let TSFlags{26-25} = DestEEW.Value; + let TSFlags{25-24} = DestEEW.Value; } class RVInstgetOperand(2).getImm(); - RISCVII::VLMUL FirstLMul = RISCVVType::getVLMUL(FirstVType); + RISCVVType::VLMUL FirstLMul = RISCVVType::getVLMUL(FirstVType); FirstSEW = RISCVVType::getSEW(FirstVType); // The first encountered vsetvli must have the same lmul as the // register class of COPY. 
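The TSFlags repacking above removes the ForceTailAgnostic bit and shifts every later field down by one: tail and mask policy now travel exclusively through the vtype encoding and explicit vec_policy operands. For orientation, the vtype byte manipulated throughout this patch packs vlmul into bits 2:0, vsew into bits 5:3, vta into bit 6 and vma into bit 7, per the layout comment in RISCVTargetParser.cpp further down. A minimal standalone C++ sketch of that packing (the enum mirrors RISCVVType::VLMUL; encodeVType is a hypothetical stand-in for the real encodeVTYPE helper):

#include <cassert>

// vlmul encodings: 0-3 select m1..m8, 5-7 select mf8..mf2 (4 is reserved).
enum VLMUL : unsigned { LMUL_1 = 0, LMUL_2, LMUL_4, LMUL_8,
                        LMUL_F8 = 5, LMUL_F4, LMUL_F2 };

unsigned encodeVType(VLMUL VLMul, unsigned SEW, bool TailAgnostic,
                     bool MaskAgnostic) {
  // SEW must be a power of two in [8, 64]; vsew stores log2(SEW) - 3.
  assert(SEW >= 8 && SEW <= 64 && (SEW & (SEW - 1)) == 0 && "invalid SEW");
  unsigned VSEWBits = 0;
  for (unsigned S = SEW; S > 8; S >>= 1)
    ++VSEWBits;
  unsigned VType = (VSEWBits << 3) | (static_cast<unsigned>(VLMul) & 0x7);
  if (TailAgnostic)
    VType |= 0x40; // vta, bit 6
  if (MaskAgnostic)
    VType |= 0x80; // vma, bit 7
  return VType;
}

For example, encodeVType(LMUL_2, 32, true, true) yields 0xd1, i.e. e32, m2, ta, ma.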
@@ -326,7 +326,7 @@ void RISCVInstrInfo::copyPhysRegVector( const DebugLoc &DL, MCRegister DstReg, MCRegister SrcReg, bool KillSrc, const TargetRegisterClass *RegClass) const { const TargetRegisterInfo *TRI = STI.getRegisterInfo(); - RISCVII::VLMUL LMul = RISCVRI::getLMul(RegClass->TSFlags); + RISCVVType::VLMUL LMul = RISCVRI::getLMul(RegClass->TSFlags); unsigned NF = RISCVRI::getNF(RegClass->TSFlags); uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg); @@ -345,7 +345,7 @@ void RISCVInstrInfo::copyPhysRegVector( unsigned I = 0; auto GetCopyInfo = [&](uint16_t SrcEncoding, uint16_t DstEncoding) - -> std::tuple std::tuple { if (ReversedCopy) { // For reversed copying, if there are enough aligned registers(8/4/2), we @@ -357,40 +357,40 @@ void RISCVInstrInfo::copyPhysRegVector( uint16_t Diff = DstEncoding - SrcEncoding; if (I + 8 <= NumRegs && Diff >= 8 && SrcEncoding % 8 == 7 && DstEncoding % 8 == 7) - return {RISCVII::LMUL_8, RISCV::VRM8RegClass, RISCV::VMV8R_V, + return {RISCVVType::LMUL_8, RISCV::VRM8RegClass, RISCV::VMV8R_V, RISCV::PseudoVMV_V_V_M8, RISCV::PseudoVMV_V_I_M8}; if (I + 4 <= NumRegs && Diff >= 4 && SrcEncoding % 4 == 3 && DstEncoding % 4 == 3) - return {RISCVII::LMUL_4, RISCV::VRM4RegClass, RISCV::VMV4R_V, + return {RISCVVType::LMUL_4, RISCV::VRM4RegClass, RISCV::VMV4R_V, RISCV::PseudoVMV_V_V_M4, RISCV::PseudoVMV_V_I_M4}; if (I + 2 <= NumRegs && Diff >= 2 && SrcEncoding % 2 == 1 && DstEncoding % 2 == 1) - return {RISCVII::LMUL_2, RISCV::VRM2RegClass, RISCV::VMV2R_V, + return {RISCVVType::LMUL_2, RISCV::VRM2RegClass, RISCV::VMV2R_V, RISCV::PseudoVMV_V_V_M2, RISCV::PseudoVMV_V_I_M2}; // Or we should do LMUL1 copying. - return {RISCVII::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, + return {RISCVVType::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, RISCV::PseudoVMV_V_V_M1, RISCV::PseudoVMV_V_I_M1}; } // For forward copying, if source register encoding and destination register // encoding are aligned to 8/4/2, we can do a LMUL8/4/2 copying. if (I + 8 <= NumRegs && SrcEncoding % 8 == 0 && DstEncoding % 8 == 0) - return {RISCVII::LMUL_8, RISCV::VRM8RegClass, RISCV::VMV8R_V, + return {RISCVVType::LMUL_8, RISCV::VRM8RegClass, RISCV::VMV8R_V, RISCV::PseudoVMV_V_V_M8, RISCV::PseudoVMV_V_I_M8}; if (I + 4 <= NumRegs && SrcEncoding % 4 == 0 && DstEncoding % 4 == 0) - return {RISCVII::LMUL_4, RISCV::VRM4RegClass, RISCV::VMV4R_V, + return {RISCVVType::LMUL_4, RISCV::VRM4RegClass, RISCV::VMV4R_V, RISCV::PseudoVMV_V_V_M4, RISCV::PseudoVMV_V_I_M4}; if (I + 2 <= NumRegs && SrcEncoding % 2 == 0 && DstEncoding % 2 == 0) - return {RISCVII::LMUL_2, RISCV::VRM2RegClass, RISCV::VMV2R_V, + return {RISCVVType::LMUL_2, RISCV::VRM2RegClass, RISCV::VMV2R_V, RISCV::PseudoVMV_V_V_M2, RISCV::PseudoVMV_V_I_M2}; // Or we should do LMUL1 copying. 
- return {RISCVII::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, + return {RISCVVType::LMUL_1, RISCV::VRRegClass, RISCV::VMV1R_V, RISCV::PseudoVMV_V_V_M1, RISCV::PseudoVMV_V_I_M1}; }; auto FindRegWithEncoding = [TRI](const TargetRegisterClass &RegClass, uint16_t Encoding) { MCRegister Reg = RISCV::V0 + Encoding; - if (RISCVRI::getLMul(RegClass.TSFlags) == RISCVII::LMUL_1) + if (RISCVRI::getLMul(RegClass.TSFlags) == RISCVVType::LMUL_1) return Reg; return TRI->getMatchingSuperReg(Reg, RISCV::sub_vrm1_0, &RegClass); }; @@ -2580,7 +2580,8 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, Ok = Imm >= 0 && Imm < RISCVCC::COND_INVALID; break; case RISCVOp::OPERAND_VEC_POLICY: - Ok = (Imm & (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC)) == Imm; + Ok = (Imm & + (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) == Imm; break; case RISCVOp::OPERAND_SEW: Ok = (isUInt<5>(Imm) && RISCVVType::isValidSEW(1 << Imm)); @@ -2648,7 +2649,7 @@ bool RISCVInstrInfo::verifyInstruction(const MachineInstr &MI, return false; } uint64_t Policy = MI.getOperand(OpIdx).getImm(); - if (Policy > (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC)) { + if (Policy > (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC)) { ErrInfo = "Invalid Policy Value"; return false; } @@ -3234,10 +3235,10 @@ std::string RISCVInstrInfo::createMIROperandComment( } case RISCVOp::OPERAND_VEC_POLICY: unsigned Policy = Op.getImm(); - assert(Policy <= (RISCVII::TAIL_AGNOSTIC | RISCVII::MASK_AGNOSTIC) && + assert(Policy <= (RISCVVType::TAIL_AGNOSTIC | RISCVVType::MASK_AGNOSTIC) && "Invalid Policy Value"); - OS << (Policy & RISCVII::TAIL_AGNOSTIC ? "ta" : "tu") << ", " - << (Policy & RISCVII::MASK_AGNOSTIC ? "ma" : "mu"); + OS << (Policy & RISCVVType::TAIL_AGNOSTIC ? "ta" : "tu") << ", " + << (Policy & RISCVVType::MASK_AGNOSTIC ? "ma" : "mu"); break; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index 33c04d1c05613..cc58cdf02e09c 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -1409,7 +1409,7 @@ class VPseudoTernaryMaskPolicyRoundingMode, + VMaskOp:$vm, AVL:$vl, sew:$sew, vec_policy:$policy), []>, RISCVVPseudo { let mayLoad = 0; let mayStore = 0; @@ -1427,6 +1427,7 @@ class VPseudoBinaryMOutMask { VPseudoBinaryNoMask; - let ForceTailAgnostic = true in def "_" # m.MX # "_MASK" : VPseudoBinaryMOutMask; + (mask_type VMV0:$vm), GPR:$vl, sew, TA_MU)>; class VPatBinaryMaskPolicy; + (mask_type VMV0:$vm), GPR:$vl, sew, TA_MU)>; class VPatTiedBinaryNoMask; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MU)>; } } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td index ffa3d3982647d..43cfc9d1e77ca 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVVLPatterns.td @@ -1020,7 +1020,7 @@ multiclass VPatIntegerSetCCVL_VV; + (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW, TA_MU)>; } // Inherits from VPatIntegerSetCCVL_VV and adds a pattern with operands swapped. 
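With ForceTailAgnostic retired, mask-out pseudos now carry an explicit vec_policy operand, and the patterns above pass TA_MU (tail agnostic, mask undisturbed) where the flag used to be implied. The policy immediate is a two-bit field, bit 0 for the tail policy and bit 1 for the mask policy, which createMIROperandComment above renders as ta/tu and ma/mu. A small self-contained sketch of that decoding, assuming only those two bit assignments:

#include <cstdint>
#include <string>

// Mirror the RISCVVType::TAIL_AGNOSTIC / MASK_AGNOSTIC bit positions.
constexpr uint64_t TailAgnostic = 1; // bit 0: ta when set, tu otherwise
constexpr uint64_t MaskAgnostic = 2; // bit 1: ma when set, mu otherwise

std::string decodePolicy(uint64_t Policy) {
  // verifyInstruction above rejects any bits outside the two-bit field.
  if (Policy > (TailAgnostic | MaskAgnostic))
    return "invalid";
  std::string S = (Policy & TailAgnostic) ? "ta" : "tu";
  S += ", ";
  S += (Policy & MaskAgnostic) ? "ma" : "mu";
  return S;
}

Under this encoding TA_MU corresponds to a policy immediate of 1, which decodePolicy prints as "ta, mu".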
@@ -1034,7 +1034,8 @@ multiclass VPatIntegerSetCCVL_VV_Swappable(instruction_name#"_VV_"#vti.LMul.MX#"_MASK") VR:$passthru, vti.RegClass:$rs1, - vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; + vti.RegClass:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, + vti.Log2SEW, TA_MU)>; } multiclass VPatIntegerSetCCVL_VX_Swappable; + GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, + vti.Log2SEW, TA_MU)>; def : Pat<(vti.Mask (riscv_setcc_vl (SplatPat (XLenVT GPR:$rs2)), (vti.Vector vti.RegClass:$rs1), invcc, VR:$passthru, (vti.Mask VMV0:$vm), VLOpFrag)), (instruction_masked VR:$passthru, vti.RegClass:$rs1, - GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, vti.Log2SEW)>; + GPR:$rs2, (vti.Mask VMV0:$vm), GPR:$vl, + vti.Log2SEW, TA_MU)>; } multiclass VPatIntegerSetCCVL_VI_Swappable; + vti.Log2SEW, TA_MU)>; // FIXME: Can do some canonicalization to remove these patterns. def : Pat<(vti.Mask (riscv_setcc_vl (splatpat_kind simm5:$rs2), @@ -1077,7 +1080,7 @@ multiclass VPatIntegerSetCCVL_VI_Swappable; + vti.Log2SEW, TA_MU)>; } multiclass VPatFPSetCCVL_VV_VF_FV(inst_name#"_VV_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, fvti.RegClass:$rs2, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; + GPR:$vl, fvti.Log2SEW, TA_MU)>; def : Pat<(fvti.Mask (vop (fvti.Vector fvti.RegClass:$rs1), (SplatFPOp fvti.ScalarRegClass:$rs2), cc, @@ -1104,7 +1107,7 @@ multiclass VPatFPSetCCVL_VV_VF_FV(inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; + GPR:$vl, fvti.Log2SEW, TA_MU)>; def : Pat<(fvti.Mask (vop (SplatFPOp fvti.ScalarRegClass:$rs2), (fvti.Vector fvti.RegClass:$rs1), cc, @@ -1114,7 +1117,7 @@ multiclass VPatFPSetCCVL_VV_VF_FV(swapped_op_inst_name#"_V"#fvti.ScalarSuffix#"_"#fvti.LMul.MX#"_MASK") VR:$passthru, fvti.RegClass:$rs1, fvti.ScalarRegClass:$rs2, (fvti.Mask VMV0:$vm), - GPR:$vl, fvti.Log2SEW)>; + GPR:$vl, fvti.Log2SEW, TA_MU)>; } } } diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h index 6c4e9c7b1bdc7..0830191dde3f4 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h @@ -43,8 +43,9 @@ static inline bool isVRegClass(uint64_t TSFlags) { } /// \returns the LMUL for the register class. -static inline RISCVII::VLMUL getLMul(uint64_t TSFlags) { - return static_cast((TSFlags & VLMulShiftMask) >> VLMulShift); +static inline RISCVVType::VLMUL getLMul(uint64_t TSFlags) { + return static_cast((TSFlags & VLMulShiftMask) >> + VLMulShift); } /// \returns the NF for the register class. 
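getLMul above just extracts the three vlmul bits from the register-class TSFlags and reinterprets them as the shared RISCVVType::VLMUL enum, whose encoding uses 0-3 for the integer multipliers m1-m8 and 5-7 for the fractional mf8-mf2, exactly as the decodeVLMUL switch in RISCVTargetParser.cpp below spells out. A standalone sketch of the decoding under those encodings (decodeLMul is a hypothetical name):

#include <cassert>
#include <utility>

enum VLMUL : unsigned { LMUL_1 = 0, LMUL_2, LMUL_4, LMUL_8,
                        LMUL_F8 = 5, LMUL_F4, LMUL_F2 };

// Returns {multiplier, fractional}: LMUL_2 -> {2, false},
// LMUL_F4 -> {4, true}, meaning LMUL = 1/4.
std::pair<unsigned, bool> decodeLMul(VLMUL V) {
  assert(V != 4 && "vlmul encoding 4 is reserved");
  if (V <= LMUL_8)
    return {1u << V, false};      // 0..3 -> 1, 2, 4, 8
  return {1u << (8 - V), true};   // 5..7 -> 8, 4, 2 (denominators)
}

This is why isLMUL1OrSmaller earlier in the patch can simply test Fractional || LMul == 1: every fractional LMUL is by construction smaller than m1.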
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index da77bae18962c..79e3b9ee09744 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -765,9 +765,11 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, } static unsigned isM1OrSmaller(MVT VT) { - RISCVII::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); - return (LMUL == RISCVII::VLMUL::LMUL_F8 || LMUL == RISCVII::VLMUL::LMUL_F4 || - LMUL == RISCVII::VLMUL::LMUL_F2 || LMUL == RISCVII::VLMUL::LMUL_1); + RISCVVType::VLMUL LMUL = RISCVTargetLowering::getLMUL(VT); + return (LMUL == RISCVVType::VLMUL::LMUL_F8 || + LMUL == RISCVVType::VLMUL::LMUL_F4 || + LMUL == RISCVVType::VLMUL::LMUL_F2 || + LMUL == RISCVVType::VLMUL::LMUL_1); } InstructionCost RISCVTTIImpl::getScalarizationOverhead( diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 1ba7f0b522a2b..e5a98598370ec 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -65,13 +65,13 @@ class RISCVVLOptimizer : public MachineFunctionPass { /// Represents the EMUL and EEW of a MachineOperand. struct OperandInfo { // Represent as 1,2,4,8, ... and fractional indicator. This is because - // EMUL can take on values that don't map to RISCVII::VLMUL values exactly. + // EMUL can take on values that don't map to RISCVVType::VLMUL values exactly. // For example, a mask operand can have an EMUL less than MF8. std::optional<std::pair<unsigned, bool>> EMUL; unsigned Log2EEW; - OperandInfo(RISCVII::VLMUL EMUL, unsigned Log2EEW) + OperandInfo(RISCVVType::VLMUL EMUL, unsigned Log2EEW) : EMUL(RISCVVType::decodeVLMUL(EMUL)), Log2EEW(Log2EEW) {} OperandInfo(std::pair<unsigned, bool> EMUL, unsigned Log2EEW) @@ -141,7 +141,7 @@ static raw_ostream &operator<<(raw_ostream &OS, /// SEW are from the TSFlags of MI. static std::pair<unsigned, bool> getEMULEqualsEEWDivSEWTimesLMUL(unsigned Log2EEW, const MachineInstr &MI) { - RISCVII::VLMUL MIVLMUL = RISCVII::getLMul(MI.getDesc().TSFlags); + RISCVVType::VLMUL MIVLMUL = RISCVII::getLMul(MI.getDesc().TSFlags); auto [MILMUL, MILMULIsFractional] = RISCVVType::decodeVLMUL(MIVLMUL); unsigned MILog2SEW = MI.getOperand(RISCVII::getSEWOpNum(MI.getDesc())).getImm(); diff --git a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp index 0bddbacc89e3e..ee90868d252e4 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorMaskDAGMutation.cpp @@ -123,7 +123,7 @@ class RISCVVectorMaskDAGMutation : public ScheduleDAGMutation { // For LMUL=8 cases, there will be more possibilities to spill. // FIXME: We should use RegPressureTracker to do fine-grained // controls.
- RISCVII::getLMul(MI->getDesc().TSFlags) != RISCVII::LMUL_8) + RISCVII::getLMul(MI->getDesc().TSFlags) != RISCVVType::LMUL_8) DAG->addEdge(&SU, SDep(NearestUseV0SU, SDep::Artificial)); } } diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index a4e7219c39f37..7c05ff1f1a70e 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -371,7 +371,7 @@ bool RISCVVectorPeephole::convertAllOnesVMergeToVMv(MachineInstr &MI) const { MI.removeOperand(2); // False operand MI.removeOperand(3); // Mask operand MI.addOperand( - MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); + MachineOperand::CreateImm(RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED)); // vmv.v.v doesn't have a mask operand, so we may be able to inflate the // register class for the destination and passthru operands e.g. VRNoV0 -> VR @@ -438,7 +438,7 @@ bool RISCVVectorPeephole::convertSameMaskVMergeToVMv(MachineInstr &MI) { MI.removeOperand(2); // False operand MI.removeOperand(3); // Mask operand MI.addOperand( - MachineOperand::CreateImm(RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED)); + MachineOperand::CreateImm(RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED)); // vmv.v.v doesn't have a mask operand, so we may be able to inflate the // register class for the destination and passthru operands e.g. VRNoV0 -> VR @@ -466,9 +466,9 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const { RISCVII::hasVecPolicyOp(MCID.TSFlags); const bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(MCID); const MCInstrDesc &MaskedMCID = TII->get(MI.getOpcode()); - assert(RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) == - RISCVII::hasVecPolicyOp(MCID.TSFlags) && - "Masked and unmasked pseudos are inconsistent"); + assert((RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) || + !RISCVII::hasVecPolicyOp(MCID.TSFlags)) && + "Unmasked pseudo has policy but masked pseudo doesn't?"); assert(HasPolicyOp == HasPassthru && "Unexpected pseudo structure"); assert(!(HasPassthru && !RISCVII::isFirstDefTiedToFirstUse(MaskedMCID)) && "Unmasked with passthru but masked with no passthru?"); @@ -476,6 +476,11 @@ bool RISCVVectorPeephole::convertToUnmasked(MachineInstr &MI) const { MI.setDesc(MCID); + // Drop the policy operand if unmasked doesn't need it. + if (RISCVII::hasVecPolicyOp(MaskedMCID.TSFlags) && + !RISCVII::hasVecPolicyOp(MCID.TSFlags)) + MI.removeOperand(RISCVII::getVecPolicyOpNum(MaskedMCID)); + // TODO: Increment all MaskOpIdxs in tablegen by num of explicit defs? unsigned MaskOpIdx = I->MaskOpIdx + MI.getNumExplicitDefs(); MI.removeOperand(MaskOpIdx); @@ -575,7 +580,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())); if (RISCV::isVLKnownLE(MIVL, SrcVL)) - SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC); + SrcPolicy.setImm(SrcPolicy.getImm() | RISCVVType::TAIL_AGNOSTIC); } MRI->replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(2).getReg()); @@ -641,10 +646,10 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { } // If MI was tail agnostic and the VL didn't increase, preserve it. 
- int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; - if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) && + int64_t Policy = RISCVVType::TAIL_UNDISTURBED_MASK_UNDISTURBED; + if ((MI.getOperand(5).getImm() & RISCVVType::TAIL_AGNOSTIC) && RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) - Policy |= RISCVII::TAIL_AGNOSTIC; + Policy |= RISCVVType::TAIL_AGNOSTIC; Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); MRI->replaceRegWith(MI.getOperand(0).getReg(), Src->getOperand(0).getReg()); diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp index 039b8929c93a8..23de30275e2a1 100644 --- a/llvm/lib/Target/X86/X86FastISel.cpp +++ b/llvm/lib/Target/X86/X86FastISel.cpp @@ -3830,7 +3830,8 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { .addConstantPoolIndex(CPI, 0, OpFlag); MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, MIMD, TII.get(Opc), ResultReg); - addRegReg(MIB, AddrReg, false, PICBase, false); + addRegReg(MIB, AddrReg, false, X86::NoSubRegister, PICBase, false, + X86::NoSubRegister); MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand( MachinePointerInfo::getConstantPool(*FuncInfo.MF), MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 386d56dcda9de..696bb14292dd0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -41629,23 +41629,28 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, case X86ISD::PSHUFD: case X86ISD::PSHUFHW: case X86ISD::PSHUFLW: + case X86ISD::VPERMV: case X86ISD::VPERMI: case X86ISD::VPERMILPI: { - if (N.getOperand(0).getValueType() == ShuffleVT && - N->isOnlyUserOf(N.getOperand(0).getNode())) { - SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(0)); + unsigned SrcIdx = Opc == X86ISD::VPERMV ? 1 : 0; + if (N.getOperand(SrcIdx).getValueType() == ShuffleVT && + N->isOnlyUserOf(N.getOperand(SrcIdx).getNode())) { + SDValue N0 = peekThroughOneUseBitcasts(N.getOperand(SrcIdx)); unsigned SrcOpcode = N0.getOpcode(); EVT OpVT = N0.getValueType(); if (TLI.isBinOp(SrcOpcode) && IsSafeToMoveShuffle(N0, SrcOpcode)) { SDValue Op00 = peekThroughOneUseBitcasts(N0.getOperand(0)); SDValue Op01 = peekThroughOneUseBitcasts(N0.getOperand(1)); - bool FoldShuf = Opc != X86ISD::VPERMI; + bool FoldShuf = Opc != X86ISD::VPERMI && Opc != X86ISD::VPERMV; if (IsMergeableWithShuffle(Op00, FoldShuf) || IsMergeableWithShuffle(Op01, FoldShuf)) { SDValue LHS, RHS; Op00 = DAG.getBitcast(ShuffleVT, Op00); Op01 = DAG.getBitcast(ShuffleVT, Op01); - if (N.getNumOperands() == 2) { + if (Opc == X86ISD::VPERMV) { + LHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op00); + RHS = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Op01); + } else if (N.getNumOperands() == 2) { LHS = DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1)); RHS = DAG.getNode(Opc, DL, ShuffleVT, Op01, N.getOperand(1)); } else { @@ -41661,11 +41666,13 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, if (SrcOpcode == ISD::SINT_TO_FP && IsSafeToMoveShuffle(N0, SrcOpcode) && OpVT.getScalarSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits()) { - SDValue Op00 = DAG.getBitcast(ShuffleVT, N0.getOperand(0)); - SDValue Res = - N.getNumOperands() == 2 - ? 
DAG.getNode(Opc, DL, ShuffleVT, Op00, N.getOperand(1)) - : DAG.getNode(Opc, DL, ShuffleVT, Op00); + SDValue Res = DAG.getBitcast(ShuffleVT, N0.getOperand(0)); + if (Opc == X86ISD::VPERMV) + Res = DAG.getNode(Opc, DL, ShuffleVT, N.getOperand(0), Res); + else if (N.getNumOperands() == 2) + Res = DAG.getNode(Opc, DL, ShuffleVT, Res, N.getOperand(1)); + else + Res = DAG.getNode(Opc, DL, ShuffleVT, Res); Res = DAG.getBitcast(N0.getOperand(0).getValueType(), Res); return DAG.getBitcast(ShuffleVT, DAG.getNode(SrcOpcode, DL, OpVT, Res)); } diff --git a/llvm/lib/Target/X86/X86InstrBuilder.h b/llvm/lib/Target/X86/X86InstrBuilder.h index 07079ef87fd46..45c5f8aa82e97 100644 --- a/llvm/lib/Target/X86/X86InstrBuilder.h +++ b/llvm/lib/Target/X86/X86InstrBuilder.h @@ -161,11 +161,14 @@ addRegOffset(const MachineInstrBuilder &MIB, /// addRegReg - This function is used to add a memory reference of the form: /// [Reg + Reg]. -static inline const MachineInstrBuilder &addRegReg(const MachineInstrBuilder &MIB, - unsigned Reg1, bool isKill1, - unsigned Reg2, bool isKill2) { - return MIB.addReg(Reg1, getKillRegState(isKill1)).addImm(1) - .addReg(Reg2, getKillRegState(isKill2)).addImm(0).addReg(0); +static inline const MachineInstrBuilder & +addRegReg(const MachineInstrBuilder &MIB, unsigned Reg1, bool isKill1, + unsigned SubReg1, unsigned Reg2, bool isKill2, unsigned SubReg2) { + return MIB.addReg(Reg1, getKillRegState(isKill1), SubReg1) + .addImm(1) + .addReg(Reg2, getKillRegState(isKill2), SubReg2) + .addImm(0) + .addReg(0); } static inline const MachineInstrBuilder & diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 44db5b6865c42..d756e73659a24 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -1158,8 +1158,9 @@ static bool findRedundantFlagInstr(MachineInstr &CmpInstr, bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned Opc, bool AllowSP, Register &NewSrc, - bool &isKill, MachineOperand &ImplicitOp, - LiveVariables *LV, LiveIntervals *LIS) const { + unsigned &NewSrcSubReg, bool &isKill, + MachineOperand &ImplicitOp, LiveVariables *LV, + LiveIntervals *LIS) const { MachineFunction &MF = *MI.getParent()->getParent(); const TargetRegisterClass *RC; if (AllowSP) { @@ -1168,12 +1169,16 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, RC = Opc != X86::LEA32r ? &X86::GR64_NOSPRegClass : &X86::GR32_NOSPRegClass; } Register SrcReg = Src.getReg(); + unsigned SubReg = Src.getSubReg(); isKill = MI.killsRegister(SrcReg, /*TRI=*/nullptr); + NewSrcSubReg = X86::NoSubRegister; + // For both LEA64 and LEA32 the register already has essentially the right // type (32-bit or 64-bit) we may just need to forbid SP. if (Opc != X86::LEA64_32r) { NewSrc = SrcReg; + NewSrcSubReg = SubReg; assert(!Src.isUndef() && "Undef op doesn't need optimization"); if (NewSrc.isVirtual() && !MF.getRegInfo().constrainRegClass(NewSrc, RC)) @@ -1189,16 +1194,18 @@ bool X86InstrInfo::classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, ImplicitOp.setImplicit(); NewSrc = getX86SubSuperRegister(SrcReg, 64); + assert(!SubReg && "no superregister for source"); assert(NewSrc.isValid() && "Invalid Operand"); assert(!Src.isUndef() && "Undef op doesn't need optimization"); } else { // Virtual register of the wrong class, we have to create a temporary 64-bit // vreg to feed into the LEA. 
NewSrc = MF.getRegInfo().createVirtualRegister(RC); + NewSrcSubReg = X86::NoSubRegister; MachineInstr *Copy = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(NewSrc, RegState::Define | RegState::Undef, X86::sub_32bit) - .addReg(SrcReg, getKillRegState(isKill)); + .addReg(SrcReg, getKillRegState(isKill), SubReg); // Which is obviously going to be dead after we're done with it. isKill = true; @@ -1258,7 +1265,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineBasicBlock::iterator MBBI = MI.getIterator(); Register Dest = MI.getOperand(0).getReg(); Register Src = MI.getOperand(1).getReg(); + unsigned SrcSubReg = MI.getOperand(1).getSubReg(); Register Src2; + unsigned Src2SubReg; bool IsDead = MI.getOperand(0).isDead(); bool IsKill = MI.getOperand(1).isKill(); unsigned SubReg = Is8BitOp ? X86::sub_8bit : X86::sub_16bit; @@ -1268,7 +1277,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, MachineInstr *InsMI = BuildMI(MBB, MBBI, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(InRegLEA, RegState::Define, SubReg) - .addReg(Src, getKillRegState(IsKill)); + .addReg(Src, getKillRegState(IsKill), SrcSubReg); MachineInstr *ImpDef2 = nullptr; MachineInstr *InsMI2 = nullptr; @@ -1306,12 +1315,14 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, case X86::ADD16rr: case X86::ADD16rr_DB: { Src2 = MI.getOperand(2).getReg(); + Src2SubReg = MI.getOperand(2).getSubReg(); bool IsKill2 = MI.getOperand(2).isKill(); assert(!MI.getOperand(2).isUndef() && "Undef op doesn't need optimization"); if (Src == Src2) { // ADD8rr/ADD16rr killed %reg1028, %reg1028 // just a single insert_subreg. - addRegReg(MIB, InRegLEA, true, InRegLEA, false); + addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA, false, + X86::NoSubRegister); } else { if (Subtarget.is64Bit()) InRegLEA2 = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -1323,8 +1334,9 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc, InRegLEA2); InsMI2 = BuildMI(MBB, &*MIB, MI.getDebugLoc(), get(TargetOpcode::COPY)) .addReg(InRegLEA2, RegState::Define, SubReg) - .addReg(Src2, getKillRegState(IsKill2)); - addRegReg(MIB, InRegLEA, true, InRegLEA2, true); + .addReg(Src2, getKillRegState(IsKill2), Src2SubReg); + addRegReg(MIB, InRegLEA, true, X86::NoSubRegister, InRegLEA2, true, + X86::NoSubRegister); } if (LV && IsKill2 && InsMI2) LV->replaceKillInstruction(Src2, MI, *InsMI2); @@ -1428,6 +1440,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, MachineInstr *NewMI = nullptr; Register SrcReg, SrcReg2; + unsigned SrcSubReg, SrcSubReg2; bool Is64Bit = Subtarget.is64Bit(); bool Is8BitOp = false; @@ -1467,17 +1480,18 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, // LEA can't handle ESP. 
bool isKill; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(0) - .addImm(1LL << ShAmt) - .addReg(SrcReg, getKillRegState(isKill)) - .addImm(0) - .addReg(0); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(0) + .addImm(1LL << ShAmt) + .addReg(SrcReg, getKillRegState(isKill), SrcSubReg) + .addImm(0) + .addReg(0); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); NewMI = MIB; @@ -1505,8 +1519,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, : (Is64Bit ? X86::LEA64_32r : X86::LEA32r); bool isKill; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1531,8 +1545,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, bool isKill; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/false, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) @@ -1569,8 +1583,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, const MachineOperand &Src2 = MI.getOperand(2); bool isKill2; MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, isKill2, - ImplicitOp2, LV, LIS)) + if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/false, SrcReg2, SrcSubReg2, + isKill2, ImplicitOp2, LV, LIS)) return nullptr; bool isKill; @@ -1580,9 +1594,10 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, // the first call inserted a COPY from Src2 and marked it as killed. isKill = isKill2; SrcReg = SrcReg2; + SrcSubReg = SrcSubReg2; } else { - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; } @@ -1592,7 +1607,8 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, if (ImplicitOp2.getReg() != 0) MIB.add(ImplicitOp2); - NewMI = addRegReg(MIB, SrcReg, isKill, SrcReg2, isKill2); + NewMI = + addRegReg(MIB, SrcReg, isKill, SrcSubReg, SrcReg2, isKill2, SrcSubReg2); // Add kills if classifyLEAReg created a new register. 
if (LV) { @@ -1625,13 +1641,14 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, bool isKill; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(SrcReg, getKillRegState(isKill)); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill), SrcSubReg); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); @@ -1665,13 +1682,14 @@ MachineInstr *X86InstrInfo::convertToThreeAddress(MachineInstr &MI, bool isKill; MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false); - if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, isKill, - ImplicitOp, LV, LIS)) + if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/true, SrcReg, SrcSubReg, + isKill, ImplicitOp, LV, LIS)) return nullptr; - MachineInstrBuilder MIB = BuildMI(MF, MI.getDebugLoc(), get(Opc)) - .add(Dest) - .addReg(SrcReg, getKillRegState(isKill)); + MachineInstrBuilder MIB = + BuildMI(MF, MI.getDebugLoc(), get(Opc)) + .add(Dest) + .addReg(SrcReg, getKillRegState(isKill), SrcSubReg); if (ImplicitOp.getReg() != 0) MIB.add(ImplicitOp); diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 5f87e02fe67c4..e499f925f48ec 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -310,8 +310,9 @@ class X86InstrInfo final : public X86GenInstrInfo { /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, Register &NewSrc, - bool &isKill, MachineOperand &ImplicitOp, - LiveVariables *LV, LiveIntervals *LIS) const; + unsigned &NewSrcSubReg, bool &isKill, + MachineOperand &ImplicitOp, LiveVariables *LV, + LiveIntervals *LIS) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. 
When this flag is set, the target diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp index 31a93f9c2a6ef..c9e495c1eba1f 100644 --- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp +++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp @@ -339,10 +339,10 @@ X86LoadValueInjectionLoadHardeningPass::getGadgetGraph( DenseMap<MachineInstr *, GraphIter> NodeMap; int FenceCount = 0, GadgetCount = 0; auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) { - auto Ref = NodeMap.find(MI); - if (Ref == NodeMap.end()) { + auto [Ref, Inserted] = NodeMap.try_emplace(MI); + if (Inserted) { auto I = Builder.addVertex(MI); - NodeMap[MI] = I; + Ref->second = I; return std::pair{I, true}; } return std::pair{Ref->getSecond(), false}; } diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index 625645a99e12f..4111f8bfd2662 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -165,12 +165,12 @@ namespace RISCVVType { // 6 | vta | Vector tail agnostic // 5:3 | vsew[2:0] | Standard element width (SEW) setting // 2:0 | vlmul[2:0] | Vector register group multiplier (LMUL) setting -unsigned encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW, bool TailAgnostic, +unsigned encodeVTYPE(VLMUL VLMul, unsigned SEW, bool TailAgnostic, bool MaskAgnostic) { assert(isValidSEW(SEW) && "Invalid SEW"); - unsigned VLMULBits = static_cast<unsigned>(VLMUL); + unsigned VLMulBits = static_cast<unsigned>(VLMul); unsigned VSEWBits = encodeSEW(SEW); - unsigned VTypeI = (VSEWBits << 3) | (VLMULBits & 0x7); + unsigned VTypeI = (VSEWBits << 3) | (VLMulBits & 0x7); if (TailAgnostic) VTypeI |= 0x40; if (MaskAgnostic) @@ -179,19 +179,19 @@ unsigned encodeVTYPE(RISCVII::VLMUL VLMUL, unsigned SEW, bool TailAgnostic, return VTypeI; } -std::pair<unsigned, bool> decodeVLMUL(RISCVII::VLMUL VLMUL) { - switch (VLMUL) { +std::pair<unsigned, bool> decodeVLMUL(VLMUL VLMul) { + switch (VLMul) { default: llvm_unreachable("Unexpected LMUL value!"); - case RISCVII::VLMUL::LMUL_1: - case RISCVII::VLMUL::LMUL_2: - case RISCVII::VLMUL::LMUL_4: - case RISCVII::VLMUL::LMUL_8: - return std::make_pair(1 << static_cast<unsigned>(VLMUL), false); - case RISCVII::VLMUL::LMUL_F2: - case RISCVII::VLMUL::LMUL_F4: - case RISCVII::VLMUL::LMUL_F8: - return std::make_pair(1 << (8 - static_cast<unsigned>(VLMUL)), true); + case LMUL_1: + case LMUL_2: + case LMUL_4: + case LMUL_8: + return std::make_pair(1 << static_cast<unsigned>(VLMul), false); + case LMUL_F2: + case LMUL_F4: + case LMUL_F8: + return std::make_pair(1 << (8 - static_cast<unsigned>(VLMul)), true); } } @@ -220,7 +220,7 @@ void printVType(unsigned VType, raw_ostream &OS) { OS << ", mu"; } -unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) { +unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul) { unsigned LMul; bool Fractional; std::tie(LMul, Fractional) = decodeVLMUL(VLMul); @@ -232,9 +232,8 @@ unsigned getSEWLMULRatio(unsigned SEW, RISCVII::VLMUL VLMul) { return (SEW * 8) / LMul; } -std::optional<RISCVII::VLMUL> -getSameRatioLMUL(unsigned SEW, RISCVII::VLMUL VLMUL, unsigned EEW) { - unsigned Ratio = RISCVVType::getSEWLMULRatio(SEW, VLMUL); +std::optional<VLMUL> getSameRatioLMUL(unsigned SEW, VLMUL VLMul, unsigned EEW) { + unsigned Ratio = RISCVVType::getSEWLMULRatio(SEW, VLMul); unsigned EMULFixedPoint = (EEW * 8) / Ratio; bool Fractional = EMULFixedPoint < 8; unsigned EMUL = Fractional ?
8 / EMULFixedPoint : EMULFixedPoint / 8; diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp index 20fc630a74a86..921fe8c18aa72 100644 --- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp +++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp @@ -126,7 +126,7 @@ struct AlwaysInlinerLegacyPass : public ModulePass { initializeAlwaysInlinerLegacyPassPass(*PassRegistry::getPassRegistry()); } - /// Main run interface method. We override here to avoid calling skipSCC(). + /// Main run interface method. bool runOnModule(Module &M) override { auto &PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 38454053b039e..a1649c276de83 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -50,6 +50,7 @@ #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Argument.h" +#include "llvm/IR/AttributeMask.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/ConstantRangeList.h" @@ -563,6 +564,43 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, for_each(LinkedDVRAssigns, InsertAssignForOverlap); } +/// Update the attributes given that a memory access is updated (the +/// dereferenced pointer could be moved forward when shortening a +/// mem intrinsic). +static void adjustArgAttributes(AnyMemIntrinsic *Intrinsic, unsigned ArgNo, + uint64_t PtrOffset) { + // Remember old attributes. + AttributeSet OldAttrs = Intrinsic->getParamAttributes(ArgNo); + + // Find attributes that should be kept, and remove the rest. + AttributeMask AttrsToRemove; + for (auto &Attr : OldAttrs) { + if (Attr.hasKindAsEnum()) { + switch (Attr.getKindAsEnum()) { + default: + break; + case Attribute::Alignment: + // Only keep alignment if PtrOffset satisfies the alignment. + if (isAligned(Attr.getAlignment().valueOrOne(), PtrOffset)) + continue; + break; + case Attribute::Dereferenceable: + case Attribute::DereferenceableOrNull: + // We could reduce the size of these attributes according to + // PtrOffset. But we simply drop these for now. + break; + case Attribute::NonNull: + case Attribute::NoUndef: + continue; + } + } + AttrsToRemove.addAttribute(Attr); + } + + // Remove the attributes that should be dropped. + Intrinsic->removeParamAttrs(ArgNo, AttrsToRemove); +} + static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, uint64_t &DeadSize, int64_t KillingStart, uint64_t KillingSize, bool IsOverwriteEnd) { @@ -644,6 +682,7 @@ static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, DeadI->getIterator()); NewDestGEP->setDebugLoc(DeadIntrinsic->getDebugLoc()); DeadIntrinsic->setDest(NewDestGEP); + adjustArgAttributes(DeadIntrinsic, 0, ToRemoveSize); } // Update attached dbg.assign intrinsics. Assume 8-bit byte.
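The new adjustArgAttributes hook exists because shortening a dead store from the front retargets the intrinsic's destination ToRemoveSize bytes into the original object, and parameter attributes that described the old pointer may no longer hold for the new one. Alignment is the delicate case: align(A) survives only when the byte offset is itself a multiple of A. A short sketch of that check, independent of the LLVM attribute machinery:

#include <cstdint>

// An align(A) attribute on a pointer stays valid after advancing the
// pointer by Offset bytes only if the offset preserves the alignment.
// This mirrors the isAligned(Attr.getAlignment().valueOrOne(), PtrOffset)
// test in adjustArgAttributes above; A is a power of two for any
// well-formed alignment attribute.
bool alignmentSurvivesOffset(uint64_t A, uint64_t Offset) {
  return Offset % A == 0;
}

// Example: a memset destination with align(16) shortened by 24 bytes must
// drop the attribute (24 % 16 != 0); shortened by 32 bytes it may keep it.

nonnull and noundef are kept unconditionally, since moving a pointer forward within the same object cannot make it null or undef, while dereferenceable(N) could in principle be tightened to dereferenceable(N - Offset) but is conservatively dropped here.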
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 872e055294d55..676d23e1ebdf0 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -9,6 +9,7 @@ add_llvm_component_library(LLVMVectorize SandboxVectorizer/Legality.cpp SandboxVectorizer/Passes/BottomUpVec.cpp SandboxVectorizer/Passes/RegionsFromMetadata.cpp + SandboxVectorizer/Passes/SeedCollection.cpp SandboxVectorizer/Passes/TransactionAcceptOrRevert.cpp SandboxVectorizer/SandboxVectorizer.cpp SandboxVectorizer/SandboxVectorizerPassBuilder.cpp diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8c41f896ad622..e8a5db28ea0a4 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9298,6 +9298,7 @@ static void addExitUsersForFirstOrderRecurrences( VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { + using namespace llvm::VPlanPatternMatch; SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups; // --------------------------------------------------------------------------- @@ -9321,6 +9322,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { PSE, RequiresScalarEpilogueCheck, CM.foldTailByMasking(), OrigLoop); + // Build hierarchical CFG. + VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); + HCFGBuilder.buildHierarchicalCFG(); + // Don't use getDecisionAndClampRange here, because we don't know the UF // so this function is better to be conservative, rather than to split // it up into different VPlans. @@ -9371,13 +9376,8 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // Construct recipes for the instructions in the loop // --------------------------------------------------------------------------- - // Scan the body of the loop in a topological order to visit each basic block - // after having visited its predecessor basic blocks. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - VPBasicBlock *HeaderVPBB = Plan->getVectorLoopRegion()->getEntryBasicBlock(); - VPBasicBlock *VPBB = HeaderVPBB; + VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion(); + VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock(); BasicBlock *HeaderBB = OrigLoop->getHeader(); bool NeedsMasks = CM.foldTailByMasking() || @@ -9389,26 +9389,70 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { RecipeBuilder.collectScaledReductions(Range); auto *MiddleVPBB = Plan->getMiddleBlock(); + + // Scan the body of the loop in a topological order to visit each basic block + // after having visited its predecessor basic blocks. + ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT( + HeaderVPBB); + VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi(); - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) { - // Relevant instructions from basic block BB will be grouped into VPRecipe - // ingredients and fill a new VPBasicBlock. - if (VPBB != HeaderVPBB) - VPBB->setName(BB->getName()); - Builder.setInsertPoint(VPBB); + VPBlockBase *PrevVPBB = nullptr; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) { + // Handle VPBBs down to the latch. + if (VPBB == LoopRegion->getExiting()) { + assert(!HCFGBuilder.getIRBBForVPB(VPBB) && + "the latch block shouldn't have a corresponding IRBB"); + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + break; + } - if (VPBB == HeaderVPBB) + // Create mask based on the IR BB corresponding to VPBB.
+ // TODO: Predicate directly based on VPlan. + Builder.setInsertPoint(VPBB, VPBB->begin()); + if (VPBB == HeaderVPBB) { + Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi()); RecipeBuilder.createHeaderMask(); - else if (NeedsMasks) - RecipeBuilder.createBlockInMask(BB); + } else if (NeedsMasks) { + // FIXME: At the moment, masks need to be placed at the beginning of the + // block, as blends introduced for phi nodes need to use it. The created + // blends should be sunk after the mask recipes. + RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB)); + } + + // Convert input VPInstructions to widened recipes. + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *SingleDef = cast(&R); + auto *UnderlyingValue = SingleDef->getUnderlyingValue(); + // Skip recipes that do not need transforming, including canonical IV, + // wide canonical IV and VPInstructions without underlying values. The + // latter are added above for masking. + // FIXME: Migrate code relying on the underlying instruction from VPlan0 + // to construct recipes below to not use the underlying instruction. + if (isa(&R) || + (isa(&R) && !UnderlyingValue)) + continue; + + // FIXME: VPlan0, which models a copy of the original scalar loop, should + // not use VPWidenPHIRecipe to model the phis. + assert((isa(&R) || isa(&R)) && + UnderlyingValue && "unsupported recipe"); - // Introduce each ingredient into VPlan. - // TODO: Model and preserve debug intrinsics in VPlan. - for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) { - Instruction *Instr = &I; + if (isa(&R) && + (cast(&R)->getOpcode() == + VPInstruction::BranchOnCond || + (cast(&R)->getOpcode() == Instruction::Switch))) { + R.eraseFromParent(); + break; + } + + // TODO: Gradually replace uses of underlying instruction by analyses on + // VPlan. + Instruction *Instr = cast(UnderlyingValue); + Builder.setInsertPoint(SingleDef); SmallVector Operands; auto *Phi = dyn_cast(Instr); if (Phi && Phi->getParent() == HeaderBB) { + // The backedge value will be added in fixHeaderPhis later. Operands.push_back(Plan->getOrAddLiveIn( Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()))); } else { @@ -9420,15 +9464,16 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { // in the exit block, a uniform store recipe will be created for the final // invariant store of the reduction. StoreInst *SI; - if ((SI = dyn_cast(&I)) && + if ((SI = dyn_cast(Instr)) && Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) { // Only create recipe for the final invariant store of the reduction. - if (!Legal->isInvariantStoreOfReduction(SI)) - continue; - auto *Recipe = new VPReplicateRecipe( - SI, make_range(Operands.begin(), Operands.end()), - true /* IsUniform */); - Recipe->insertBefore(*MiddleVPBB, MBIP); + if (Legal->isInvariantStoreOfReduction(SI)) { + auto *Recipe = new VPReplicateRecipe( + SI, make_range(Operands.begin(), Operands.end()), + true /* IsUniform */); + Recipe->insertBefore(*MiddleVPBB, MBIP); + } + R.eraseFromParent(); continue; } @@ -9438,30 +9483,31 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) { Recipe = RecipeBuilder.handleReplication(Instr, Operands, Range); RecipeBuilder.setRecipe(Instr, Recipe); - if (isa(Recipe)) { - // VPHeaderPHIRecipes must be kept in the phi section of HeaderVPBB. 
In - // the following cases, VPHeaderPHIRecipes may be created after non-phi - // recipes and need to be moved to the phi section of HeaderVPBB: - // * tail-folding (non-phi recipes computing the header mask are - // introduced earlier than regular header phi recipes, and should appear - // after them) - // * Optimizing truncates to VPWidenIntOrFpInductionRecipe. - - assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() || - CM.foldTailByMasking() || isa(Instr)) && - "unexpected recipe needs moving"); + if (isa(Recipe) && isa(Instr)) { + // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be + // moved to the phi section in the header. Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi()); - } else - VPBB->appendRecipe(Recipe); + } else { + Builder.insert(Recipe); + } + if (Recipe->getNumDefinedValues() == 1) + SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue()); + else + assert(Recipe->getNumDefinedValues() == 0 && + "Unexpected multidef recipe"); + R.eraseFromParent(); } - VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB); - VPBB = cast(VPBB->getSingleSuccessor()); + // Flatten the CFG in the loop. Masks for blocks have already been generated + // and added to recipes as needed. To do so, first disconnect VPBB from its + // successors. Then connect VPBB to the previously visited VPBB. + for (auto *Succ : to_vector(VPBB->getSuccessors())) + VPBlockUtils::disconnectBlocks(VPBB, Succ); + if (PrevVPBB) + VPBlockUtils::connectBlocks(PrevVPBB, VPBB); + PrevVPBB = VPBB; } - // After here, VPBB should not be used. - VPBB = nullptr; - assert(isa(Plan->getVectorLoopRegion()) && !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() && "entry block must be set to a VPRegionBlock having a non-empty entry " diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index e946620406c2e..f2aa0e8328585 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -5724,6 +5724,10 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { auto CompareByBasicBlocks = [&](BasicBlock *BB1, BasicBlock *BB2) { assert(BB1 != BB2 && "Expected different basic blocks."); + if (!DT->isReachableFromEntry(BB1)) + return false; + if (!DT->isReachableFromEntry(BB2)) + return true; auto *NodeA = DT->getNode(BB1); auto *NodeB = DT->getNode(BB2); assert(NodeA && "Should only process reachable instructions"); @@ -12130,6 +12134,30 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { })) return true; + // Do not vectorize small tree of phis only, if all vector phis are also + // gathered. + if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.size() <= Limit && + all_of(VectorizableTree, + [&](const std::unique_ptr &TE) { + return (TE->isGather() && + (!TE->hasState() || + TE->getOpcode() != Instruction::ExtractElement) && + count_if(TE->Scalars, IsaPred) <= + Limit) || + (TE->hasState() && + (TE->getOpcode() == Instruction::InsertElement || + (TE->getOpcode() == Instruction::PHI && + all_of(TE->Scalars, [&](Value *V) { + return isa(V) || MustGather.contains(V); + })))); + }) && + any_of(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->State == TreeEntry::Vectorize && + TE->getOpcode() == Instruction::PHI; + })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. 
if (VectorizableTree.size() >= MinTreeSize) diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp index c9a6098860c10..3da52b5b4a6f1 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.cpp @@ -483,19 +483,37 @@ void DependencyGraph::notifyEraseInstr(Instruction *I) { if (Ctx->getTracker().getState() == Tracker::TrackerState::Reverting) // We don't maintain the DAG while reverting. return; - // Update the MemDGNode chain if this is a memory node. - if (auto *MemN = dyn_cast_or_null<MemDGNode>(getNodeOrNull(I))) { + auto *N = getNode(I); + if (N == nullptr) + // Early return if there is no DAG node for `I`. + return; + if (auto *MemN = dyn_cast<MemDGNode>(getNode(I))) { + // Update the MemDGNode chain if this is a memory node. auto *PrevMemN = getMemDGNodeBefore(MemN, /*IncludingN=*/false); auto *NextMemN = getMemDGNodeAfter(MemN, /*IncludingN=*/false); if (PrevMemN != nullptr) PrevMemN->NextMemN = NextMemN; if (NextMemN != nullptr) NextMemN->PrevMemN = PrevMemN; - } + // Drop the memory dependencies from both predecessors and successors. + while (!MemN->memPreds().empty()) { + auto *PredN = *MemN->memPreds().begin(); + MemN->removeMemPred(PredN); + } + while (!MemN->memSuccs().empty()) { + auto *SuccN = *MemN->memSuccs().begin(); + SuccN->removeMemPred(MemN); + } + // NOTE: The unscheduled succs for MemNodes get updated by setMemPred(). + } else { + // If this is a non-mem node we only need to update UnscheduledSuccs. + if (!N->scheduled()) + for (auto *PredN : N->preds(*this)) + PredN->decrUnscheduledSuccs(); + } + // Finally erase the Node. InstrToNodeMap.erase(I); - - // TODO: Update the dependencies.
} void DependencyGraph::notifySetUse(const Use &U, Value *NewSrc) { diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp index 0ccef5aecd28b..d57732090dcd6 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp @@ -14,20 +14,10 @@ #include "llvm/SandboxIR/Module.h" #include "llvm/SandboxIR/Region.h" #include "llvm/SandboxIR/Utils.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h" namespace llvm { -static cl::opt - OverrideVecRegBits("sbvec-vec-reg-bits", cl::init(0), cl::Hidden, - cl::desc("Override the vector register size in bits, " - "which is otherwise found by querying TTI.")); -static cl::opt - AllowNonPow2("sbvec-allow-non-pow2", cl::init(false), cl::Hidden, - cl::desc("Allow non-power-of-2 vectorization.")); - #ifndef NDEBUG static cl::opt AlwaysVerify("sbvec-always-verify", cl::init(false), cl::Hidden, @@ -37,10 +27,6 @@ static cl::opt namespace sandboxir { -BottomUpVec::BottomUpVec(StringRef Pipeline) - : FunctionPass("bottom-up-vec"), - RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {} - static SmallVector getOperand(ArrayRef Bndl, unsigned OpIdx) { SmallVector Operands; @@ -413,6 +399,7 @@ Value *BottomUpVec::vectorizeRec(ArrayRef Bndl, } bool BottomUpVec::tryVectorize(ArrayRef Bndl) { + Change = false; DeadInstrCandidates.clear(); Legality->clear(); vectorizeRec(Bndl, {}, /*Depth=*/0); @@ -420,83 +407,21 @@ bool BottomUpVec::tryVectorize(ArrayRef Bndl) { return Change; } -bool BottomUpVec::runOnFunction(Function &F, const Analyses &A) { +bool BottomUpVec::runOnRegion(Region &Rgn, const Analyses &A) { + const auto &SeedSlice = Rgn.getAux(); + assert(SeedSlice.size() >= 2 && "Bad slice!"); + Function &F = *SeedSlice[0]->getParent()->getParent(); IMaps = std::make_unique(F.getContext()); Legality = std::make_unique( A.getAA(), A.getScalarEvolution(), F.getParent()->getDataLayout(), F.getContext(), *IMaps); - Change = false; - const auto &DL = F.getParent()->getDataLayout(); - unsigned VecRegBits = - OverrideVecRegBits != 0 - ? OverrideVecRegBits - : A.getTTI() - .getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) - .getFixedValue(); - - // TODO: Start from innermost BBs first - for (auto &BB : F) { - SeedCollector SC(&BB, A.getScalarEvolution()); - for (SeedBundle &Seeds : SC.getStoreSeeds()) { - unsigned ElmBits = - Utils::getNumBits(VecUtils::getElementType(Utils::getExpectedType( - Seeds[Seeds.getFirstUnusedElementIdx()])), - DL); - - auto DivideBy2 = [](unsigned Num) { - auto Floor = VecUtils::getFloorPowerOf2(Num); - if (Floor == Num) - return Floor / 2; - return Floor; - }; - // Try to create the largest vector supported by the target. If it fails - // reduce the vector size by half. - for (unsigned SliceElms = std::min(VecRegBits / ElmBits, - Seeds.getNumUnusedBits() / ElmBits); - SliceElms >= 2u; SliceElms = DivideBy2(SliceElms)) { - if (Seeds.allUsed()) - break; - // Keep trying offsets after FirstUnusedElementIdx, until we vectorize - // the slice. This could be quite expensive, so we enforce a limit. 
- for (unsigned Offset = Seeds.getFirstUnusedElementIdx(), - OE = Seeds.size(); - Offset + 1 < OE; Offset += 1) { - // Seeds are getting used as we vectorize, so skip them. - if (Seeds.isUsed(Offset)) - continue; - if (Seeds.allUsed()) - break; - auto SeedSlice = - Seeds.getSlice(Offset, SliceElms * ElmBits, !AllowNonPow2); - if (SeedSlice.empty()) - continue; - - assert(SeedSlice.size() >= 2 && "Should have been rejected!"); - - // TODO: Refactor to remove the unnecessary copy to SeedSliceVals. - SmallVector SeedSliceVals(SeedSlice.begin(), - SeedSlice.end()); - // Create an empty region. Instructions get added to the region - // automatically by the callbacks. - auto &Ctx = F.getContext(); - Region Rgn(Ctx, A.getTTI()); - // Save the state of the IR before we make any changes. The - // transaction gets accepted/reverted by the tr-accept-or-revert pass. - Ctx.save(); - // Try to vectorize starting from the seed slice. The returned value - // is true if we found vectorizable code and generated some vector - // code for it. It does not mean that the code is profitable. - bool VecSuccess = tryVectorize(SeedSliceVals); - if (VecSuccess) - // WARNING: All passes should return false, except those that - // accept/revert the state. - Change |= RPM.runOnRegion(Rgn, A); - } - } - } - } - return Change; + // TODO: Refactor to remove the unnecessary copy to SeedSliceVals. + SmallVector SeedSliceVals(SeedSlice.begin(), SeedSlice.end()); + // Try to vectorize starting from the seed slice. The returned value + // is true if we found vectorizable code and generated some vector + // code for it. It does not mean that the code is profitable. + return tryVectorize(SeedSliceVals); } } // namespace sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def index f3aa12729860f..722c6f5db4192 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def @@ -21,6 +21,7 @@ REGION_PASS("null", ::llvm::sandboxir::NullPass) REGION_PASS("print-instruction-count", ::llvm::sandboxir::PrintInstructionCount) REGION_PASS("tr-accept", ::llvm::sandboxir::TransactionAlwaysAccept) REGION_PASS("tr-accept-or-revert", ::llvm::sandboxir::TransactionAcceptOrRevert) +REGION_PASS("bottom-up-vec", ::llvm::sandboxir::BottomUpVec) #undef REGION_PASS @@ -28,7 +29,7 @@ REGION_PASS("tr-accept-or-revert", ::llvm::sandboxir::TransactionAcceptOrRevert) #define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS_NAME) #endif -FUNCTION_PASS_WITH_PARAMS("bottom-up-vec", ::llvm::sandboxir::BottomUpVec) +FUNCTION_PASS_WITH_PARAMS("seed-collection", ::llvm::sandboxir::SeedCollection) FUNCTION_PASS_WITH_PARAMS("regions-from-metadata", ::llvm::sandboxir::RegionsFromMetadata) #undef FUNCTION_PASS_WITH_PARAMS diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.cpp new file mode 100644 index 0000000000000..0001c9bb7c7e4 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.cpp @@ -0,0 +1,98 @@ +//===- SeedCollection.cpp - Seed collection pass --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/SandboxIR/Module.h"
+#include "llvm/SandboxIR/Region.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/VecUtils.h"
+
+namespace llvm {
+
+static cl::opt<unsigned>
+    OverrideVecRegBits("sbvec-vec-reg-bits", cl::init(0), cl::Hidden,
+                       cl::desc("Override the vector register size in bits, "
+                                "which is otherwise found by querying TTI."));
+static cl::opt<bool>
+    AllowNonPow2("sbvec-allow-non-pow2", cl::init(false), cl::Hidden,
+                 cl::desc("Allow non-power-of-2 vectorization."));
+
+namespace sandboxir {
+SeedCollection::SeedCollection(StringRef Pipeline)
+    : FunctionPass("seed-collection"),
+      RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {}
+
+bool SeedCollection::runOnFunction(Function &F, const Analyses &A) {
+  bool Change = false;
+  const auto &DL = F.getParent()->getDataLayout();
+  unsigned VecRegBits =
+      OverrideVecRegBits != 0
+          ? OverrideVecRegBits
+          : A.getTTI()
+                .getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector)
+                .getFixedValue();
+
+  // TODO: Start from innermost BBs first
+  for (auto &BB : F) {
+    SeedCollector SC(&BB, A.getScalarEvolution());
+    for (SeedBundle &Seeds : SC.getStoreSeeds()) {
+      unsigned ElmBits =
+          Utils::getNumBits(VecUtils::getElementType(Utils::getExpectedType(
+                                Seeds[Seeds.getFirstUnusedElementIdx()])),
+                            DL);
+
+      auto DivideBy2 = [](unsigned Num) {
+        auto Floor = VecUtils::getFloorPowerOf2(Num);
+        if (Floor == Num)
+          return Floor / 2;
+        return Floor;
+      };
+      // Try to create the largest vector supported by the target. If it
+      // fails, reduce the vector size by half.
+      for (unsigned SliceElms = std::min(VecRegBits / ElmBits,
+                                         Seeds.getNumUnusedBits() / ElmBits);
+           SliceElms >= 2u; SliceElms = DivideBy2(SliceElms)) {
+        if (Seeds.allUsed())
+          break;
+        // Keep trying offsets after FirstUnusedElementIdx, until we vectorize
+        // the slice. This could be quite expensive, so we enforce a limit.
+        for (unsigned Offset = Seeds.getFirstUnusedElementIdx(),
+                      OE = Seeds.size();
+             Offset + 1 < OE; Offset += 1) {
+          // Seeds are getting used as we vectorize, so skip them.
+          if (Seeds.isUsed(Offset))
+            continue;
+          if (Seeds.allUsed())
+            break;
+
+          auto SeedSlice =
+              Seeds.getSlice(Offset, SliceElms * ElmBits, !AllowNonPow2);
+          if (SeedSlice.empty())
+            continue;
+
+          assert(SeedSlice.size() >= 2 && "Should have been rejected!");
+
+          // Create a region containing the seed slice.
+          auto &Ctx = F.getContext();
+          Region Rgn(Ctx, A.getTTI());
+          // TODO: Replace save() with a save pass in the pass pipeline.
+          Ctx.save();
+          Rgn.setAux(SeedSlice);
+          // Run the region pass pipeline.
+ Change |= RPM.runOnRegion(Rgn, A); + Rgn.clearAux(); + } + } + } + } + return Change; +} +} // namespace sandboxir +} // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index b233d35212f94..4f17aa213bbb0 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -32,9 +32,11 @@ static cl::opt UserDefinedPassPipeline( SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { // TODO: Add passes to the default pipeline. It currently contains: - // - the bottom-up-vectorizer pass + // - Seed collection, which creates seed regions and runs the pipeline + // - Bottom-up Vectorizer pass that starts from a seed + // - Accept or revert IR state pass FPM.setPassPipeline( - "bottom-up-vec", + "seed-collection", sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); } else { // Create the user-defined pipeline. diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp index 0c1ab55e91a5c..e552f0570dd9d 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp @@ -4,6 +4,7 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/SeedCollection.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAcceptOrRevert.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/TransactionAlwaysAccept.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 1332e50252978..cd111365c134c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -600,16 +600,25 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) { } const VPRecipeBase *R = &VPBB->back(); + bool IsSwitch = isa(R) && + cast(R)->getOpcode() == Instruction::Switch; bool IsCondBranch = isa(R) || match(R, m_BranchOnCond(m_VPValue())) || match(R, m_BranchOnCount(m_VPValue(), m_VPValue())); (void)IsCondBranch; - - if (VPBB->getNumSuccessors() >= 2 || + (void)IsSwitch; + if (VPBB->getNumSuccessors() == 2 || (VPBB->isExiting() && !VPBB->getParent()->isReplicator())) { - assert(IsCondBranch && "block with multiple successors not terminated by " - "conditional branch recipe"); + assert((IsCondBranch || IsSwitch) && + "block with multiple successors not terminated by " + "conditional branch nor switch recipe"); + + return true; + } + if (VPBB->getNumSuccessors() > 2) { + assert(IsSwitch && "block with more than 2 successors not terminated by " + "a switch recipe"); return true; } diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp index 70d8575ba82c5..22c2f91ff55f6 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp @@ -75,7 +75,7 @@ class PlainCFGBuilder { : TheLoop(Lp), LI(LI), Plan(P) {} /// Build plain CFG for TheLoop and 
connects it to Plan's entry.
-  void buildPlainCFG();
+  void buildPlainCFG(DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB);
 };
 } // anonymous namespace

@@ -242,10 +242,10 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
     // Instruction definition is in outermost loop PH.
     return false;

-  // Check whether Instruction definition is in the loop exit.
-  BasicBlock *Exit = TheLoop->getUniqueExitBlock();
-  assert(Exit && "Expected loop with single exit.");
-  if (InstParent == Exit) {
+  // Check whether Instruction definition is in a loop exit.
+  SmallVector<BasicBlock *> ExitBlocks;
+  TheLoop->getExitBlocks(ExitBlocks);
+  if (is_contained(ExitBlocks, InstParent)) {
     // Instruction definition is in outermost loop exit.
     return false;
   }
@@ -288,6 +288,7 @@ VPValue *PlainCFGBuilder::getOrCreateVPOperand(Value *IRVal) {
 void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
                                                   BasicBlock *BB) {
   VPIRBuilder.setInsertPoint(VPBB);
+  // TODO: Model and preserve debug intrinsics in VPlan.
   for (Instruction &InstRef : BB->instructionsWithoutDebug(false)) {
     Instruction *Inst = &InstRef;
@@ -313,6 +314,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
       continue;
     }

+    if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
+      SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
+      for (auto Case : SI->cases())
+        Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
+      VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
+      continue;
+    }
+
     VPValue *NewVPV;
     if (auto *Phi = dyn_cast<PHINode>(Inst)) {
       // Phi node's operands may have not been visited at this point. We create
@@ -339,7 +348,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
 }

 // Main interface to build the plain CFG.
-void PlainCFGBuilder::buildPlainCFG() {
+void PlainCFGBuilder::buildPlainCFG(
+    DenseMap<const VPBlockBase *, BasicBlock *> &VPB2IRBB) {
   // 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
   // skeleton. These were created directly rather than via getOrCreateVPBB(),
   // revisit them now to update BB2VPBB. Note that header/entry and
@@ -428,6 +438,14 @@ void PlainCFGBuilder::buildPlainCFG() {
     // Set VPBB successors. We create empty VPBBs for successors if they don't
     // exist already. Recipes will be created when the successor is visited
     // during the RPO traversal.
+    if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+      SmallVector<VPBlockBase *> Succs = {
+          getOrCreateVPBB(SI->getDefaultDest())};
+      for (auto Case : SI->cases())
+        Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
+      VPBB->setSuccessors(Succs);
+      continue;
+    }
     auto *BI = cast<BranchInst>(BB->getTerminator());
     unsigned NumSuccs = succ_size(BB);
     if (NumSuccs == 1) {
@@ -481,11 +499,14 @@ void PlainCFGBuilder::buildPlainCFG() {
   // have a VPlan couterpart. Fix VPlan phi nodes by adding their corresponding
   // VPlan operands.
   fixPhiNodes();
+
+  for (const auto &[IRBB, VPB] : BB2VPBB)
+    VPB2IRBB[VPB] = IRBB;
 }

 void VPlanHCFGBuilder::buildPlainCFG() {
   PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
-  PCFGBuilder.buildPlainCFG();
+  PCFGBuilder.buildPlainCFG(VPB2IRBB);
 }

 // Public interface to build a H-CFG.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
index ad6e2ad90a961..bc853bf7a1395 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
   // are introduced.
   VPDominatorTree VPDomTree;

+  /// Map of created VP blocks to their input IR basic blocks, if they have
+  /// been created for an input IR basic block.
+  DenseMap<const VPBlockBase *, BasicBlock *> VPB2IRBB;
+
   /// Build plain CFG for TheLoop and connects it to Plan's entry.
   void buildPlainCFG();

@@ -62,6 +66,14 @@ class VPlanHCFGBuilder {

   /// Build H-CFG for TheLoop and update Plan accordingly.
   void buildHierarchicalCFG();
+
+  /// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr
+  /// if there is no such corresponding block.
+  /// FIXME: This is a temporary workaround to drive the createBlockInMask.
+  /// Remove once mask creation is done on VPlan.
+  BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
+    return VPB2IRBB.lookup(VPB);
+  }
 };
 } // namespace llvm

diff --git a/llvm/test/Analysis/ScalarEvolution/trunc-simplify.ll b/llvm/test/Analysis/ScalarEvolution/trunc-simplify.ll
index f26478cb13fa3..b461b6a600c65 100644
--- a/llvm/test/Analysis/ScalarEvolution/trunc-simplify.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trunc-simplify.ll
@@ -1,13 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 5
 ; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s

 ; Check that we convert
 ;   trunc(C * a) -> trunc(C) * trunc(a)
 ; if C is a constant.
-; CHECK-LABEL: @trunc_of_mul
 define i8 @trunc_of_mul(i32 %a) {
+; CHECK-LABEL: 'trunc_of_mul'
+; CHECK-NEXT:  Classifying expressions for: @trunc_of_mul
+; CHECK-NEXT:    %b = mul i32 %a, 100
+; CHECK-NEXT:    --> (100 * %a) U: [0,-3) S: [-2147483648,2147483645)
+; CHECK-NEXT:    %c = trunc i32 %b to i8
+; CHECK-NEXT:    --> (100 * (trunc i32 %a to i8)) U: [0,-3) S: [-128,125)
+; CHECK-NEXT:  Determining loop execution counts for: @trunc_of_mul
+;
   %b = mul i32 %a, 100
-  ; CHECK: %c
-  ; CHECK-NEXT: --> (100 * (trunc i32 %a to i8))
   %c = trunc i32 %b to i8
   ret i8 %c
 }
@@ -15,31 +21,43 @@ define i8 @trunc_of_mul(i32 %a) {
 ; Check that we convert
 ;   trunc(C + a) -> trunc(C) + trunc(a)
 ; if C is a constant.
-; CHECK-LABEL: @trunc_of_add
 define i8 @trunc_of_add(i32 %a) {
+; CHECK-LABEL: 'trunc_of_add'
+; CHECK-NEXT:  Classifying expressions for: @trunc_of_add
+; CHECK-NEXT:    %b = add i32 %a, 100
+; CHECK-NEXT:    --> (100 + %a) U: full-set S: full-set
+; CHECK-NEXT:    %c = trunc i32 %b to i8
+; CHECK-NEXT:    --> (100 + (trunc i32 %a to i8)) U: full-set S: full-set
+; CHECK-NEXT:  Determining loop execution counts for: @trunc_of_add
+;
   %b = add i32 %a, 100
-  ; CHECK: %c
-  ; CHECK-NEXT: --> (100 + (trunc i32 %a to i8))
   %c = trunc i32 %b to i8
   ret i8 %c
 }

 ; Check that we truncate to zero values assumed to have at least as many
 ; trailing zeros as the target type.
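
A quick aside on why the two folds above are sound: truncation to iN is a ring homomorphism, so it commutes with add and mul modulo 2^N, and a value with at least N known trailing zero bits truncates to zero, which is the case the next test exercises via llvm.assume. A standalone brute-force check of these identities (illustrative C++, not part of the test suite):

    #include <cassert>
    #include <cstdint>

    int main() {
      for (uint32_t A = 0; A < (1u << 16); ++A) {
        // trunc(100 * a) == 100 * trunc(a)  (mod 2^8)
        assert(uint8_t(100u * A) == uint8_t(100u * uint8_t(A)));
        // trunc(100 + a) == 100 + trunc(a)  (mod 2^8)
        assert(uint8_t(100u + A) == uint8_t(100u + uint8_t(A)));
        // >= 8 known trailing zeros => trunc to i8 is 0
        assert(uint8_t(A << 8) == 0);
      }
      return 0;
    }
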
-; CHECK-LABEL: @trunc_to_assumed_zeros define i8 @trunc_to_assumed_zeros(ptr %p) { +; CHECK-LABEL: 'trunc_to_assumed_zeros' +; CHECK-NEXT: Classifying expressions for: @trunc_to_assumed_zeros +; CHECK-NEXT: %a = load i32, ptr %p, align 4 +; CHECK-NEXT: --> %a U: [0,-255) S: [-2147483648,2147483393) +; CHECK-NEXT: %and = and i32 %a, 255 +; CHECK-NEXT: --> 0 U: [0,1) S: [0,1) +; CHECK-NEXT: %c = trunc i32 %a to i8 +; CHECK-NEXT: --> 0 U: [0,1) S: [0,1) +; CHECK-NEXT: %d = trunc i32 %a to i1 +; CHECK-NEXT: --> false U: [0,-1) S: [0,-1) +; CHECK-NEXT: %e = trunc i32 %a to i16 +; CHECK-NEXT: --> (trunc i32 %a to i16) U: [0,-255) S: [-32768,32513) +; CHECK-NEXT: Determining loop execution counts for: @trunc_to_assumed_zeros +; %a = load i32, ptr %p %and = and i32 %a, 255 %cmp = icmp eq i32 %and, 0 tail call void @llvm.assume(i1 %cmp) - ; CHECK: %c - ; CHECK-NEXT: --> 0 %c = trunc i32 %a to i8 - ; CHECK: %d - ; CHECK-NEXT: --> false %d = trunc i32 %a to i1 - ; CHECK: %e - ; CHECK-NEXT: --> (trunc i32 %a to i16) %e = trunc i32 %a to i16 ret i8 %c } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll index 4a151aeca87e4..6171c73d8d2dc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.make.buffer.rsrc.ll @@ -25,7 +25,7 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -43,7 +43,7 @@ define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) %loaded = call float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) ret float %loaded } @@ -74,7 +74,7 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY5]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -104,7 +104,7 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY 
[[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } @@ -136,7 +136,7 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, ; CHECK-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sreg_32 = V_READFIRSTLANE_B32 [[COPY8]], implicit $exec ; CHECK-NEXT: $sgpr3 = COPY [[V_READFIRSTLANE_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0, implicit $sgpr1, implicit $sgpr2, implicit $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } @@ -161,7 +161,7 @@ define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 i ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_2]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } @@ -221,12 +221,52 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: bb.5: ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +define amdgpu_ps float @read_buffer_fat_ptr_p0(ptr inreg %p) { + ; CHECK-LABEL: name: read_buffer_fat_ptr_p0 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY 
[[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i32 0, i32 0) + %loaded = load float, ptr addrspace(7) %ptr + ret float %loaded +} + +define amdgpu_ps float @read_buffer_fat_ptr_p1(ptr addrspace(1) inreg %p) { + ; CHECK-LABEL: name: read_buffer_fat_ptr_p1 + ; CHECK: bb.1 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY1]], [[S_MOV_B32_1]], implicit-def dead $scc + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_]], %subreg.sub2, [[S_MOV_B32_]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = load float, ptr addrspace(7) %ptr + ret float %loaded +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll index b4840bce53d2c..3aa5ea995559f 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.make.buffer.rsrc.ll @@ -18,7 +18,7 @@ define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr inreg %p) { ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_1]] ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_2]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -36,7 +36,7 @@ define amdgpu_ps float @read_raw_buffer(ptr addrspace(1) inreg %p) { ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 4, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) %loaded = call 
float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) ret float %loaded } @@ -59,7 +59,7 @@ define amdgpu_ps ptr addrspace(8) @basic_struct_buffer(ptr inreg %p) { ; CHECK-NEXT: $sgpr2 = COPY [[S_MOV_B32_2]] ; CHECK-NEXT: $sgpr3 = COPY [[S_MOV_B32_3]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc } @@ -81,7 +81,7 @@ define amdgpu_ps ptr addrspace(8) @variable_top_half(ptr inreg %p, i32 inreg %nu ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 4, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } @@ -104,7 +104,7 @@ define amdgpu_ps ptr addrspace(8) @general_case(ptr inreg %p, i16 inreg %stride, ; CHECK-NEXT: $sgpr2 = COPY [[COPY1]] ; CHECK-NEXT: $sgpr3 = COPY [[COPY]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1, $sgpr2, $sgpr3 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) ret ptr addrspace(8) %rsrc } @@ -128,7 +128,7 @@ define amdgpu_ps float @general_case_load(ptr inreg %p, i16 inreg %stride, i32 i ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } @@ -153,12 +153,52 @@ define amdgpu_ps float @general_case_load_with_waterfall(ptr %p, i16 %stride, i3 ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN [[COPY5]], killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.rsrc, align 1, addrspace 8) ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_IDXEN]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 %stride, i32 %numVals, i32 %flags) %value = call float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0) ret float %value } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr nocapture readnone, i16, i32, i32) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +define amdgpu_ps float @read_buffer_fat_ptr_p0(ptr inreg %p) { + ; CHECK-LABEL: name: read_buffer_fat_ptr_p0 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: 
[[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr %p, i16 0, i32 0, i32 0) + %loaded = load float, ptr addrspace(7) %ptr + ret float %loaded +} + +define amdgpu_ps float @read_buffer_fat_ptr_p1(ptr addrspace(1) inreg %p) { + ; CHECK-LABEL: name: read_buffer_fat_ptr_p1 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $sgpr0, $sgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0 + ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535 + ; CHECK-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32 = S_AND_B32 [[COPY]], killed [[S_MOV_B32_]], implicit-def dead $scc + ; CHECK-NEXT: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 0 + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, killed [[S_AND_B32_]], %subreg.sub1, [[S_MOV_B32_1]], %subreg.sub2, [[S_MOV_B32_1]], %subreg.sub3 + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET killed [[REG_SEQUENCE]], [[S_MOV_B32_1]], 0, 0, 0, implicit $exec :: (dereferenceable load (s32) from %ir.ptr, align 1, addrspace 8) + ; CHECK-NEXT: $vgpr0 = COPY [[BUFFER_LOAD_DWORD_OFFSET]] + ; CHECK-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %ptr = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %p, i16 0, i32 0, i32 0) + %loaded = load float, ptr addrspace(7) %ptr + ret float %loaded +} + +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p0(ptr nocapture readnone, i16, i32, i32) +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) declare float @llvm.amdgcn.raw.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) declare float @llvm.amdgcn.struct.ptr.buffer.load(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll index 99fcbc595ff7f..ea4117b418959 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-buffer-fat-pointers-pointer-ops.ll @@ -349,6 +349,20 @@ define <2 x ptr addrspace(7)> @addrspacecast_vec(<2 x ptr addrspace(8)> %buf) { ret <2 x ptr addrspace(7)> %ret } +declare ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1), i16, i32, i32) + +define ptr addrspace(7) @make_buffer_rsrc(ptr addrspace(1) 
%buf, i16 %stride, i32 %numRecords, i32 %flags) { +; CHECK-LABEL: define { ptr addrspace(8), i32 } @make_buffer_rsrc +; CHECK-SAME: (ptr addrspace(1) [[BUF:%.*]], i16 [[STRIDE:%.*]], i32 [[NUMRECORDS:%.*]], i32 [[FLAGS:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[RET:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[BUF]], i16 [[STRIDE]], i32 [[NUMRECORDS]], i32 [[FLAGS]]) +; CHECK-NEXT: [[TMP1:%.*]] = insertvalue { ptr addrspace(8), i32 } poison, ptr addrspace(8) [[RET]], 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertvalue { ptr addrspace(8), i32 } [[TMP1]], i32 0, 1 +; CHECK-NEXT: ret { ptr addrspace(8), i32 } [[TMP2]] +; + %ret = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %buf, i16 %stride, i32 %numRecords, i32 %flags) + ret ptr addrspace(7) %ret +} + define i1 @icmp_eq(ptr addrspace(7) %a, ptr addrspace(7) %b) { ; CHECK-LABEL: define i1 @icmp_eq ; CHECK-SAME: ({ ptr addrspace(8), i32 } [[A:%.*]], { ptr addrspace(8), i32 } [[B:%.*]]) #[[ATTR0]] { diff --git a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll index 0679686f77eef..4f88077e3b0ee 100644 --- a/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll +++ b/llvm/test/CodeGen/AMDGPU/make-buffer-rsrc-lds-fails.ll @@ -3,7 +3,7 @@ ; RUN: not --crash llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s define amdgpu_ps ptr addrspace(8) @basic_raw_buffer(ptr addrspace(3) inreg %p) { - %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) %p, i16 0, i32 1234, i32 5678) + %rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) %p, i16 0, i32 1234, i32 5678) ret ptr addrspace(8) %rsrc } -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p3(ptr addrspace(3) nocapture readnone, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p3(ptr addrspace(3) nocapture readnone, i16, i32, i32) diff --git a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll index e2f4d1c6e57bc..0ac3d652050d3 100644 --- a/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll +++ b/llvm/test/CodeGen/AMDGPU/ptr-buffer-alias-scheduling.ll @@ -85,8 +85,8 @@ define amdgpu_kernel void @buffers_from_flat_dont_alias(ptr noalias %a.flat, ptr ; GISEL-NEXT: v_mul_f32_e32 v3, v3, v3 ; GISEL-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GISEL-NEXT: s_endpgm - %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %a.flat, i16 0, i32 16, i32 0) - %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %b.flat, i16 0, i32 16, i32 0) + %a = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %a.flat, i16 0, i32 16, i32 0) + %b = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %b.flat, i16 0, i32 16, i32 0) %l0 = call float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8) %a, i32 0, i32 0, i32 0) %s0 = fmul float %l0, %l0 @@ -211,4 +211,4 @@ declare i32 @llvm.amdgcn.workitem.id.x() declare float @llvm.amdgcn.raw.ptr.buffer.load.f32(ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.f32(float, ptr addrspace(8), i32, i32, i32 immarg) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr readnone nocapture, i16, i32, i32) +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone nocapture, i16, i32, i32) diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll 
b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll
new file mode 100644
index 0000000000000..ad2aa7997eba9
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-function.ll
@@ -0,0 +1,26 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: First element of root signature is not a Function
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+define void @anotherMain() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3 } ; function, root signature
+!3 = !{ !4 } ; list of root signature elements
+!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
+!5 = !{ i32 -1, !6 } ; function, root signature
+!6 = !{ !7 } ; list of root signature elements
+!7 = !{ !"RootFlags", i32 2 } ; 2 = deny_vertex_shader_root_access
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll
new file mode 100644
index 0000000000000..4d881f96e4c3b
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-is-not-value.ll
@@ -0,0 +1,26 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: First element of root signature is not a Value
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+define void @anotherMain() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
+!2 = !{ ptr @main, !3 } ; function, root signature
+!3 = !{ !4 } ; list of root signature elements
+!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
+!5 = !{ !3, !6 } ; function, root signature
+!6 = !{ !7 } ; list of root signature elements
+!7 = !{ !"RootFlags", i32 2 } ; 2 = deny_vertex_shader_root_access
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll
new file mode 100644
index 0000000000000..b5109022b4b0d
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-no-root-element-list.ll
@@ -0,0 +1,26 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Root Element mdnode is null.
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+define void @anotherMain() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
+!2 = !{ ptr @main, null } ; function, root signature
+!3 = !{ !4 } ; list of root signature elements
+!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
+!5 = !{ i32 -1, !6 } ; function, root signature
+!6 = !{ !7 } ; list of root signature elements
+!7 = !{ !"RootFlags", i32 2 } ; 2 = deny_vertex_shader_root_access
diff --git a/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll
new file mode 100644
index 0000000000000..7e6bcdadd3862
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/ContainerData/RootSignature-Error-root-element-not-mdnode.ll
@@ -0,0 +1,26 @@
+; RUN: not opt -passes='print<dxil-root-signature>' %s -S -o - 2>&1 | FileCheck %s
+
+target triple = "dxil-unknown-shadermodel6.0-compute"
+
+; CHECK: error: Root Element is not a metadata node.
+; CHECK-NOT: Root Signature Definitions
+
+define void @main() #0 {
+entry:
+  ret void
+}
+
+define void @anotherMain() #0 {
+entry:
+  ret void
+}
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
+
+!dx.rootsignatures = !{!2, !5} ; list of function/root signature pairs
+!2 = !{ ptr @main, i32 -1 } ; function, root signature
+!3 = !{ !4 } ; list of root signature elements
+!4 = !{ !"RootFlags", i32 1 } ; 1 = allow_input_assembler_input_layout
+!5 = !{ i32 -1, !6 } ; function, root signature
+!6 = !{ !7 } ; list of root signature elements
+!7 = !{ !"RootFlags", i32 2 } ; 2 = deny_vertex_shader_root_access
diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
index d2906c4613295..c84f7735b66d4 100644
--- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
+++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-op-info.mir
@@ -1116,10 +1116,10 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: vmop_vv_passthru_use
     ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, 1, 0 /* e8 */
-    ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */
+    ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1
     ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */
     %x:vrnov0 = PseudoVMAND_MM_B8 $noreg, $noreg, -1, 0 /* e1 */
-    %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */
+    %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1
     %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */
...
---
@@ -1128,10 +1128,10 @@ body: |
   bb.0:
     ; CHECK-LABEL: name: vmop_vv_passthru_use_incompatible_eew
     ; CHECK: %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0 /* tu, mu */
-    ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */
+    ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1
     ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */
     %x:vrnov0 = PseudoVADD_VV_M1 $noreg, $noreg, $noreg, -1, 3 /* e8 */, 0
-    %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */
+    %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1
     %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */
...
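
Stepping back to the four DirectX RootSignature-Error tests added above: together they pin down the expected shape of each !dx.rootsignatures entry, a (Function, MDNode) pair. A sketch of the validation they exercise (an assumed shape for illustration, not the actual pass code):

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Metadata.h"
    using namespace llvm;

    // Returns true iff Entry looks like { ptr @fn, !elements }. Each of the
    // tests above violates exactly one of these checks.
    static bool isWellFormedEntry(const MDNode *Entry) {
      if (!Entry || Entry->getNumOperands() != 2)
        return false;
      // "is not a Value" / "is not a Function": operand 0 must wrap a Function.
      const auto *VAM =
          dyn_cast_or_null<ValueAsMetadata>(Entry->getOperand(0).get());
      if (!VAM || !isa<Function>(VAM->getValue()))
        return false;
      // "mdnode is null" / "not a metadata node": operand 1 must be a non-null
      // MDNode holding the root signature elements.
      return isa_and_nonnull<MDNode>(Entry->getOperand(1).get());
    }
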
--- @@ -1140,10 +1140,10 @@ body: | bb.0: ; CHECK-LABEL: name: vmop_vv_passthru_use_incompatible_emul ; CHECK: %x:vrnov0 = PseudoVMAND_MM_B16 $noreg, $noreg, -1, 0 /* e8 */ - ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + ; CHECK-NEXT: %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1 ; CHECK-NEXT: %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e8 */ %x:vrnov0 = PseudoVMAND_MM_B16 $noreg, $noreg, -1, 0 /* e1 */ - %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */ + %y:vrnov0 = PseudoVMSEQ_VV_M1_MASK %x, $noreg, $noreg, $noreg, 1, 3 /* e8 */, 1 %z:vr = PseudoVMAND_MM_B8 %y, $noreg, 1, 0 /* e1 */ ... --- diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 4d269cfff2afe..8449107f39e91 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -510,6 +510,103 @@ define bfloat @fold_ext_trunc2(bfloat %a) nounwind { ret bfloat %trunc } +define bfloat @fold_from_half(half %a) nounwind { +; X86-LABEL: fold_from_half: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; X86-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE2-LABEL: fold_from_half: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: callq __extendhfsf2@PLT +; SSE2-NEXT: callq __truncsfbf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; FP16-LABEL: fold_from_half: +; FP16: # %bb.0: +; FP16-NEXT: vcvtsh2ss %xmm0, %xmm0, %xmm0 +; FP16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FP16-NEXT: retq +; +; AVXNC-LABEL: fold_from_half: +; AVXNC: # %bb.0: +; AVXNC-NEXT: vcvtph2ps %xmm0, %xmm0 +; AVXNC-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; AVXNC-NEXT: retq + %ext = fpext half %a to float + %trunc = fptrunc float %ext to bfloat + ret bfloat %trunc +} + +define half @fold_to_half(bfloat %a) nounwind { +; X86-LABEL: fold_to_half: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: shll $16, %eax +; X86-NEXT: vmovd %eax, %xmm0 +; X86-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; X86-NEXT: retl +; +; SSE2-LABEL: fold_to_half: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: shll $16, %eax +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: callq __truncsfhf2@PLT +; SSE2-NEXT: popq %rax +; SSE2-NEXT: retq +; +; BF16-LABEL: fold_to_half: +; BF16: # %bb.0: +; BF16-NEXT: vpextrw $0, %xmm0, %eax +; BF16-NEXT: shll $16, %eax +; BF16-NEXT: vmovd %eax, %xmm0 +; BF16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 +; BF16-NEXT: retq +; +; FP16-LABEL: fold_to_half: +; FP16: # %bb.0: +; FP16-NEXT: vmovw %xmm0, %eax +; FP16-NEXT: shll $16, %eax +; FP16-NEXT: vmovd %eax, %xmm0 +; FP16-NEXT: vcvtss2sh %xmm0, %xmm0, %xmm0 +; FP16-NEXT: retq + %ext = fpext bfloat %a to float + %trunc = fptrunc float %ext to half + ret half %trunc +} + +define bfloat @bitcast_from_half(half %a) nounwind { +; X86-LABEL: bitcast_from_half: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl +; +; CHECK-LABEL: bitcast_from_half: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %bc = bitcast half %a to bfloat + ret bfloat %bc +} + +define half @bitcast_to_half(bfloat %a) nounwind { +; X86-LABEL: bitcast_to_half: +; X86: # %bb.0: +; X86-NEXT: vmovsh {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero +; X86-NEXT: retl +; +; CHECK-LABEL: bitcast_to_half: +; CHECK: # %bb.0: +; CHECK-NEXT: retq + %bc = bitcast bfloat %a to half + ret half 
%bc +} + define <8 x bfloat> @addv(<8 x bfloat> %a, <8 x bfloat> %b) nounwind { ; X86-LABEL: addv: ; X86: # %bb.0: @@ -747,15 +844,15 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2: # %bb.0: ; SSE2-NEXT: xorl %eax, %eax ; SSE2-NEXT: testb %al, %al -; SSE2-NEXT: jne .LBB12_1 +; SSE2-NEXT: jne .LBB16_1 ; SSE2-NEXT: # %bb.2: # %cond.load ; SSE2-NEXT: movzwl (%rax), %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movd %eax, %xmm0 -; SSE2-NEXT: jmp .LBB12_3 -; SSE2-NEXT: .LBB12_1: +; SSE2-NEXT: jmp .LBB16_3 +; SSE2-NEXT: .LBB16_1: ; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: .LBB12_3: +; SSE2-NEXT: .LBB16_3: ; SSE2-NEXT: pushq %r14 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: subq $88, %rsp @@ -992,10 +1089,10 @@ define <32 x bfloat> @pr63017_2() nounwind { ; AVXNC-NEXT: vbroadcastss {{.*#+}} ymm0 = [49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024,49024] ; AVXNC-NEXT: xorl %eax, %eax ; AVXNC-NEXT: testb %al, %al -; AVXNC-NEXT: jne .LBB12_2 +; AVXNC-NEXT: jne .LBB16_2 ; AVXNC-NEXT: # %bb.1: # %cond.load ; AVXNC-NEXT: vmovups (%rax), %ymm0 -; AVXNC-NEXT: .LBB12_2: +; AVXNC-NEXT: .LBB16_2: ; AVXNC-NEXT: vmovaps %ymm0, %ymm1 ; AVXNC-NEXT: retq %1 = call <32 x bfloat> @llvm.masked.load.v32bf16.p0(ptr poison, i32 2, <32 x i1> poison, <32 x bfloat> ) diff --git a/llvm/test/CodeGen/X86/vector-partial-undef.ll b/llvm/test/CodeGen/X86/vector-partial-undef.ll index fd41fd53e3be1..4753dba2d468f 100644 --- a/llvm/test/CodeGen/X86/vector-partial-undef.ll +++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll @@ -151,9 +151,9 @@ define <8 x i32> @xor_undef_elts_alt(<4 x i32> %x) { ; AVX: # %bb.0: ; AVX-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 -; AVX-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: vmovaps {{.*#+}} ymm1 = [6,1,5,4,3,2,0,7] ; AVX-NEXT: vpermps %ymm0, %ymm1, %ymm0 +; AVX-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX-NEXT: retq %extend = shufflevector <4 x i32> %x, <4 x i32> undef, <8 x i32> %bogus_bo = xor <8 x i32> %extend, diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 07c770abc65d6..05b0a7c10b4e1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -2469,10 +2469,10 @@ define <8 x i32> @combine_unneeded_subvector1(<8 x i32> %a) { ; ; AVX2-FAST-ALL-LABEL: combine_unneeded_subvector1: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [7,6,5,4,7,6,5,4] ; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-ALL-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: combine_unneeded_subvector1: diff --git a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll index bc1756f6ca9d1..43fbcfcc600c6 100644 --- a/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll +++ b/llvm/test/Transforms/DeadStoreElimination/OverwriteStoreBegin.ll @@ -402,3 +402,33 @@ entry: store i64 1, ptr %p, align 1 ret void } + +; Verify that we adjust/drop the dereferenceable attribute. 
+define void @dereferenceable(ptr nocapture %p) { +; CHECK-LABEL: @dereferenceable( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 24, i1 false) +; CHECK-NEXT: store i32 1, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.memset.p0.i64(ptr dereferenceable(28) align 4 %p, i8 0, i64 28, i1 false) + store i32 1, ptr %p, align 4 + ret void +} + +; Verify that we adjust/drop the dereferenceable_or_null attribute. +define void @dereferenceable_or_null(ptr nocapture %p) { +; CHECK-LABEL: @dereferenceable_or_null( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 8 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[TMP0]], i8 0, i64 20, i1 false) +; CHECK-NEXT: store i64 1, ptr [[P]], align 4 +; CHECK-NEXT: ret void +; +entry: + call void @llvm.memset.p0.i64(ptr dereferenceable_or_null(28) align 4 %p, i8 0, i64 28, i1 false) + store i64 1, ptr %p, align 4 + ret void +} diff --git a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll index 59ec2d47bc72c..9ef153183cc9e 100644 --- a/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll +++ b/llvm/test/Transforms/FunctionAttrs/make-buffer-rsrc.ll @@ -9,8 +9,8 @@ define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) { ; FNATTRS: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; FNATTRS-LABEL: define {{[^@]+}}@test_make_buffer_rsrc ; FNATTRS-SAME: (ptr readonly captures(none) [[P:%.*]], ptr writeonly captures(none) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { -; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P]], i16 0, i32 4, i32 822243328) -; FNATTRS-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) +; FNATTRS-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i32 4, i32 822243328) +; FNATTRS-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) ; FNATTRS-NEXT: [[V:%.*]] = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) [[P_RSRC]], i32 0, i32 0, i32 0) ; FNATTRS-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[V]], ptr addrspace(8) [[Q_RSRC]], i32 0, i32 0, i32 0) ; FNATTRS-NEXT: ret void @@ -18,21 +18,21 @@ define amdgpu_kernel void @test_make_buffer_rsrc(ptr %p, ptr %q) { ; ATTRIBUTOR: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) ; ATTRIBUTOR-LABEL: define {{[^@]+}}@test_make_buffer_rsrc ; ATTRIBUTOR-SAME: (ptr nofree readonly captures(none) [[P:%.*]], ptr nofree writeonly captures(none) [[Q:%.*]]) #[[ATTR0:[0-9]+]] { -; ATTRIBUTOR-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[P]], i16 0, i32 4, i32 822243328) #[[ATTR4:[0-9]+]] -; ATTRIBUTOR-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) #[[ATTR4]] +; ATTRIBUTOR-NEXT: [[P_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[P]], i16 0, i32 4, i32 822243328) #[[ATTR4:[0-9]+]] +; ATTRIBUTOR-NEXT: [[Q_RSRC:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr [[Q]], i16 0, i32 4, i32 822243328) #[[ATTR4]] ; ATTRIBUTOR-NEXT: [[V:%.*]] = call i8 
@llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) readonly captures(none) [[P_RSRC]], i32 0, i32 0, i32 0) #[[ATTR5:[0-9]+]] ; ATTRIBUTOR-NEXT: call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 [[V]], ptr addrspace(8) writeonly captures(none) [[Q_RSRC]], i32 0, i32 0, i32 0) #[[ATTR6:[0-9]+]] ; ATTRIBUTOR-NEXT: ret void ; - %p.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %p, i16 0, i32 4, i32 822243328) - %q.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %q, i16 0, i32 4, i32 822243328) + %p.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %p, i16 0, i32 4, i32 822243328) + %q.rsrc = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %q, i16 0, i32 4, i32 822243328) %v = call i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) %p.rsrc, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.i8(i8 %v, ptr addrspace(8) %q.rsrc, i32 0, i32 0, i32 0) ret void } ; Function Attrs: mustprogress nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr readnone, i16, i32, i32) #0 +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr readnone, i16, i32, i32) #0 ; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(argmem: read) declare i8 @llvm.amdgcn.raw.ptr.buffer.load.i8(ptr addrspace(8) nocapture readonly, i32, i32, i32 immarg) #1 diff --git a/llvm/test/Transforms/InstCombine/select-icmp-and.ll b/llvm/test/Transforms/InstCombine/select-icmp-and.ll index 516a1e8496b43..c854a770aa74c 100644 --- a/llvm/test/Transforms/InstCombine/select-icmp-and.ll +++ b/llvm/test/Transforms/InstCombine/select-icmp-and.ll @@ -794,3 +794,18 @@ define i32 @select_bittest_to_shl_negative_test(i32 %x) { %res = add nuw nsw i32 %y, 2 ret i32 %res } + +define i8 @select_bittest_to_xor(i8 %x) { +; CHECK-LABEL: @select_bittest_to_xor( +; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X:%.*]], -1 +; CHECK-NEXT: call void @use1(i1 [[CMP]]) +; CHECK-NEXT: [[MASKSEL:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: ret i8 [[MASKSEL]] +; + %cmp = icmp sgt i8 %x, -1 + call void @use1(i1 %cmp) + %and = and i8 %x, 127 + %or = or i8 %x, -128 + %masksel = select i1 %cmp, i8 %or, i8 %and + ret i8 %masksel +} diff --git a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll index 2e539d03afc1c..e69da434c0caf 100644 --- a/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll +++ b/llvm/test/Transforms/LICM/AMDGPU/buffer-rsrc-ptrs.ll @@ -165,8 +165,8 @@ define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p ; CHECK-LABEL: define void @hoistable_buffer_construction_intrinsic ; CHECK-SAME: (ptr addrspace(1) noalias [[P_GLOBAL:%.*]], ptr addrspace(1) noalias [[Q_GLOBAL:%.*]], i32 [[BOUND:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i32 0, i32 0) -; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i32 0, i32 0) +; CHECK-NEXT: [[P:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[P_GLOBAL]], i16 0, i32 0, i32 0) +; CHECK-NEXT: [[Q:%.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) [[Q_GLOBAL]], i16 0, i32 0, i32 0) ; CHECK-NEXT: [[HOISTABLE:%.*]] = call i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) [[Q]], i32 0, i32 0, i32 0, 
i32 0) ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: @@ -181,8 +181,8 @@ define void @hoistable_buffer_construction_intrinsic(ptr addrspace(1) noalias %p ; CHECK-NEXT: ret void ; entry: - %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %p.global, i16 0, i32 0, i32 0) - %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) %q.global, i16 0, i32 0, i32 0) + %p = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %p.global, i16 0, i32 0, i32 0) + %q = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) %q.global, i16 0, i32 0, i32 0) br label %loop loop: %i = phi i32 [0, %entry], [%next, %loop] @@ -256,8 +256,8 @@ declare i32 @llvm.amdgcn.raw.ptr.buffer.load.i32(ptr addrspace(8) nocapture read declare i32 @llvm.amdgcn.struct.ptr.buffer.load.i32(ptr addrspace(8) nocapture readonly, i32, i32, i32, i32 immarg) #0 ; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write) declare void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32, ptr addrspace(8) nocapture writeonly, i32, i32, i32 immarg) #1 -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) #2 -declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p1(ptr addrspace(1) readnone nocapture, i16, i32, i32) +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) nocapture readnone, i16, i32, i32) #2 +declare ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p1(ptr addrspace(1) readnone nocapture, i16, i32, i32) attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: read) } attributes #1 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) } attributes #2 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index ebb5d46cd8438..4e862bf2f7480 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. ; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom @@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0 ; CHECK-NEXT: LV: Using user VF vscale x 4. 
; CHECK-NEXT: LV: Loop does not require scalar epilogue -; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 +; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1 ; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64 ; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom ; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll new file mode 100644 index 0000000000000..aeb82d800a2f7 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/unreachable-blocks-with-phis.ll @@ -0,0 +1,43 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S --passes=slp-vectorizer -mtriple=aarch64 < %s | FileCheck %s + +define void @test() { +; CHECK-LABEL: define void @test() { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr null, align 1 +; CHECK-NEXT: br label %[[IF_END:.*]] +; CHECK: [[IF_THEN:.*]]: +; CHECK-NEXT: br label %[[IF_END]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP0]], %[[ENTRY]] ], [ poison, %[[IF_THEN]] ] +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: store i32 [[TMP2]], ptr null, align 1 +; CHECK-NEXT: br label %[[TRAP:.*]] +; CHECK: [[BB3:.*:]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: store i32 [[TMP4]], ptr null, align 1 +; CHECK-NEXT: ret void +; CHECK: [[TRAP]]: +; CHECK-NEXT: unreachable +; +entry: + %g_2197.real32.pre = load i32, ptr null, align 1 + %g_2197.imag33.pre = load i32, ptr getelementptr inbounds nuw ({ i32, i32 }, ptr null, i32 0, i32 1), align 1 + br label %if.end + +if.then: + br label %if.end + +if.end: + %g_2197.imag33 = phi i32 [ %g_2197.imag33.pre, %entry ], [ 0, %if.then ] + %g_2197.real32 = phi i32 [ %g_2197.real32.pre, %entry ], [ 0, %if.then ] + store i32 %g_2197.real32, ptr null, align 1 + br label %trap + +0: + store i32 %g_2197.imag33, ptr null, align 1 + ret void + +trap: + unreachable +} diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll new file mode 100644 index 0000000000000..c4f35d8dfc219 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/small-phi-tree.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -mtriple=riscv64 -mcpu=sifive-x280 -passes=slp-vectorizer -S -slp-threshold=-11 < %s | FileCheck %s + +define float @test(ptr %call78) { +; CHECK-LABEL: @test( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> , ptr [[CALL78:%.*]], i32 1 +; CHECK-NEXT: br label [[FOR_BODY194:%.*]] +; CHECK: for.body194: +; CHECK-NEXT: [[INDVARS_IV132:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ 0, [[FOR_BODY194]] ] +; CHECK-NEXT: [[CURRENTW_031:%.*]] = phi ptr [ [[CALL78]], [[ENTRY]] ], [ [[PREVIOUSW_030:%.*]], [[FOR_BODY194]] ] +; CHECK-NEXT: [[PREVIOUSW_030]] = phi ptr [ null, [[ENTRY]] ], [ [[CURRENTW_031]], [[FOR_BODY194]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x ptr> [ [[TMP0]], [[ENTRY]] ], [ [[TMP3:%.*]], [[FOR_BODY194]] ] +; CHECK-NEXT: store float 0.000000e+00, ptr [[CURRENTW_031]], align 4 +; CHECK-NEXT: tail call void null(ptr [[PREVIOUSW_030]], ptr null, ptr null, i32 0, i32 0, ptr null, ptr null, i32 0) +; CHECK-NEXT: 
[[TMP2:%.*]] = insertelement <2 x ptr> poison, ptr [[CURRENTW_031]], i32 0 +; CHECK-NEXT: [[TMP3]] = insertelement <2 x ptr> [[TMP2]], ptr [[PREVIOUSW_030]], i32 1 +; CHECK-NEXT: br i1 false, label [[FOR_END286_LOOPEXIT:%.*]], label [[FOR_BODY194]] +; CHECK: for.end286.loopexit: +; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x ptr> [ [[TMP1]], [[FOR_BODY194]] ] +; CHECK-NEXT: ret float 0.000000e+00 +; +entry: + br label %for.body194 + +for.body194: + %indvars.iv132 = phi i64 [ 0, %entry ], [ 0, %for.body194 ] + %currentw.031 = phi ptr [ %call78, %entry ], [ %previousw.030, %for.body194 ] + %previousw.030 = phi ptr [ null, %entry ], [ %currentw.031, %for.body194 ] + store float 0.000000e+00, ptr %currentw.031, align 4 + tail call void null(ptr %previousw.030, ptr null, ptr null, i32 0, i32 0, ptr null, ptr null, i32 0) + br i1 false, label %for.end286.loopexit, label %for.body194 + +for.end286.loopexit: + %currentw.031.lcssa = phi ptr [ %currentw.031, %for.body194 ] + %previousw.030.lcssa = phi ptr [ %previousw.030, %for.body194 ] + ret float 0.000000e+00 +} + + diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll index 6b18d4069e0ae..ad23126b681fd 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s define void @store_load(ptr %ptr) { ; CHECK-LABEL: define void @store_load( diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll index 202b5a6fbd6c9..f4fcc5742f8a7 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s declare void @foo() diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll index 1b189831569f5..4218ca830dccf 100644 --- a/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll +++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_seed_slice_pow2.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=false -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s --check-prefix=POW2 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=true -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s --check-prefix=NON-POW2 +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=false -sbvec-passes="seed-collection" %s -S | FileCheck %s --check-prefix=POW2 +; RUN: opt -passes=sandbox-vectorizer 
-sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2=true -sbvec-passes="seed-collection" %s -S | FileCheck %s --check-prefix=NON-POW2 define void @pow2(ptr %ptr, float %val) { ; POW2-LABEL: define void @pow2( diff --git a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll index ff1604173c317..75d7cc8d9cefe 100644 --- a/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/cross_bbs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s define void @cross_bbs(ptr %ptr) { ; CHECK-LABEL: define void @cross_bbs( diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll index 10de4338caf23..86000da42c799 100644 --- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll @@ -5,8 +5,9 @@ ; This checks the default pass pipeline for the sandbox vectorizer. define void @pipeline() { ; CHECK: fpm -; CHECK: bottom-up-vec +; CHECK: seed-collection ; CHECK: rpm +; CHECK: bottom-up-vec ; CHECK: tr-accept-or-revert ; CHECK-EMPTY: ret void diff --git a/llvm/test/Transforms/SandboxVectorizer/pack.ll b/llvm/test/Transforms/SandboxVectorizer/pack.ll index da41036e3a58b..c1e22d3dc2e73 100644 --- a/llvm/test/Transforms/SandboxVectorizer/pack.ll +++ b/llvm/test/Transforms/SandboxVectorizer/pack.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s define void @pack_constants(ptr %ptr) { ; CHECK-LABEL: define void @pack_constants( diff --git a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll index add762ac2d894..3e4ef6787a563 100644 --- a/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll +++ b/llvm/test/Transforms/SandboxVectorizer/repeated_instrs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s define i32 @repeated_splat(ptr %ptr, i32 %v) #0 { ; CHECK-LABEL: define i32 @repeated_splat( diff --git a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll index acbec80db6b06..fd4847da920d1 100644 --- a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll +++ b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | 
FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s ; This used to crash because the newly added pack instructions would not update ; the DAG and scheduler, leading to def-after-use. diff --git a/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll b/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll index e8fe8b4fa88e3..1aca7cf2a8bd8 100644 --- a/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll +++ b/llvm/test/Transforms/SandboxVectorizer/special_opcodes.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="bottom-up-vec" %s -S | FileCheck %s +; RUN: opt -passes=sandbox-vectorizer -sbvec-vec-reg-bits=1024 -sbvec-allow-non-pow2 -sbvec-passes="seed-collection" %s -S | FileCheck %s ; This file includes tests for opcodes that need special checks. diff --git a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll index b11b55ed96019..2c57a8e7347d2 100644 --- a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll +++ b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll @@ -1,9 +1,9 @@ ; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \ -; RUN: -disable-output -sbvec-passes="bottom-up-vec" %s \ +; RUN: -disable-output -sbvec-passes="seed-collection" %s \ ; RUN: | FileCheck %s ; ; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \ -; RUN: -disable-output -sbvec-passes="bottom-up-vec<>,regions-from-metadata<>" %s \ +; RUN: -disable-output -sbvec-passes="seed-collection<>,regions-from-metadata<>" %s \ ; RUN: | FileCheck --check-prefix CHECK-MULTIPLE-FUNCTION-PASSES %s ; !!!WARNING!!! This won't get updated by update_test_checks.py ! 
@@ -14,14 +14,14 @@ define void @pipeline() { } ; CHECK: fpm -; CHECK: bottom-up-vec +; CHECK: seed-collection ; CHECK: rpm ; CHECK: null ; CHECK: null ; CHECK-EMPTY: ; CHECK-MULTIPLE-FUNCTION-PASSES: fpm -; CHECK-MULTIPLE-FUNCTION-PASSES: bottom-up-vec +; CHECK-MULTIPLE-FUNCTION-PASSES: seed-collection ; CHECK-MULTIPLE-FUNCTION-PASSES: rpm ; CHECK-MULTIPLE-FUNCTION-PASSES: regions-from-metadata ; CHECK-MULTIPLE-FUNCTION-PASSES: rpm diff --git a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp index 83c8f7e932b2b..a1ea7849d7c0c 100644 --- a/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp +++ b/llvm/unittests/Frontend/OpenMPIRBuilderTest.cpp @@ -5928,6 +5928,7 @@ TEST_F(OpenMPIRBuilderTest, TargetEnterData) { return CombinedInfo; }; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; llvm::OpenMPIRBuilder::TargetDataInfo Info( /*RequiresDevicePointerInfo=*/false, /*SeparateBeginEndCalls=*/true); @@ -5939,7 +5940,7 @@ TEST_F(OpenMPIRBuilderTest, TargetEnterData) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTargetData( Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), - /* IfCond= */ nullptr, Info, GenMapInfoCB, &RTLFunc)); + /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); CallInst *TargetDataCall = dyn_cast(&BB->back()); @@ -5990,6 +5991,7 @@ TEST_F(OpenMPIRBuilderTest, TargetExitData) { return CombinedInfo; }; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; llvm::OpenMPIRBuilder::TargetDataInfo Info( /*RequiresDevicePointerInfo=*/false, /*SeparateBeginEndCalls=*/true); @@ -6001,7 +6003,7 @@ TEST_F(OpenMPIRBuilderTest, TargetExitData) { OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTargetData( Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), - /* IfCond= */ nullptr, Info, GenMapInfoCB, &RTLFunc)); + /* IfCond= */ nullptr, Info, GenMapInfoCB, CustomMapperCB, &RTLFunc)); Builder.restoreIP(AfterIP); CallInst *TargetDataCall = dyn_cast(&BB->back()); @@ -6074,6 +6076,7 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { return CombinedInfo; }; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; llvm::OpenMPIRBuilder::TargetDataInfo Info( /*RequiresDevicePointerInfo=*/true, /*SeparateBeginEndCalls=*/true); @@ -6110,9 +6113,10 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP1, - OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), - /* IfCond= */ nullptr, Info, GenMapInfoCB, nullptr, BodyCB)); + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + Builder.getInt64(DeviceID), + /* IfCond= */ nullptr, Info, GenMapInfoCB, + CustomMapperCB, nullptr, BodyCB)); Builder.restoreIP(TargetDataIP1); CallInst *TargetDataCall = dyn_cast(&BB->back()); @@ -6138,9 +6142,10 @@ TEST_F(OpenMPIRBuilderTest, TargetDataRegion) { }; ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, TargetDataIP2, - OMPBuilder.createTargetData( - Loc, AllocaIP, Builder.saveIP(), Builder.getInt64(DeviceID), - /* IfCond= */ nullptr, Info, GenMapInfoCB, nullptr, BodyTargetCB)); + OMPBuilder.createTargetData(Loc, AllocaIP, Builder.saveIP(), + Builder.getInt64(DeviceID), + /* IfCond= */ nullptr, Info, GenMapInfoCB, + CustomMapperCB, nullptr, BodyTargetCB)); Builder.restoreIP(TargetDataIP2); EXPECT_TRUE(CheckDevicePassBodyGen); @@ -6241,6 +6246,11 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { return CombinedInfos; }; + auto 
CustomMapperCB = [&](unsigned int I) { return nullptr; }; + llvm::OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); + TargetRegionEntryInfo EntryInfo("func", 42, 4711, 17); OpenMPIRBuilder::LocationDescription OmpLoc({Builder.saveIP(), DL}); OpenMPIRBuilder::TargetKernelRuntimeAttrs RuntimeAttrs; @@ -6254,9 +6264,10 @@ TEST_F(OpenMPIRBuilderTest, TargetRegion) { ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), EntryInfo, DefaultAttrs, + Builder.saveIP(), Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, + CustomMapperCB, {}, false)); EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); @@ -6400,6 +6411,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { return CombinedInfos; }; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> OpenMPIRBuilder::InsertPointTy { @@ -6419,13 +6431,17 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDevice) { OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; + llvm::OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, + Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, - BodyGenCB, SimpleArgAccessorCB)); + BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, + {}, false)); EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); @@ -6549,6 +6565,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { F->setName("func"); IRBuilder<> Builder(BB); + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](InsertPointTy, InsertPointTy CodeGenIP) -> InsertPointTy { Builder.restoreIP(CodeGenIP); @@ -6576,13 +6593,17 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionSPMD) { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; RuntimeAttrs.LoopTripCount = Builder.getInt64(1000); + llvm::OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(OmpLoc, /*IsOffloadEntry=*/true, Builder.saveIP(), - Builder.saveIP(), EntryInfo, DefaultAttrs, + Builder.saveIP(), Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, Inputs, - GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB)); + GenMapInfoCB, BodyGenCB, SimpleArgAccessorCB, + CustomMapperCB)); Builder.restoreIP(AfterIP); OMPBuilder.finalize(); @@ -6663,6 +6684,7 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { return CombinedInfos; }; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> OpenMPIRBuilder::InsertPointTy { @@ -6679,13 +6701,16 @@ TEST_F(OpenMPIRBuilderTest, TargetRegionDeviceSPMD) { 
OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_SPMD, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; + llvm::OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, + Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, - BodyGenCB, SimpleArgAccessorCB)); + BodyGenCB, SimpleArgAccessorCB, CustomMapperCB)); Builder.restoreIP(AfterIP); Builder.CreateRetVoid(); @@ -6779,6 +6804,7 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { llvm::Value *RaiseAlloca = nullptr; + auto CustomMapperCB = [&](unsigned int I) { return nullptr; }; auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP, OpenMPIRBuilder::InsertPointTy CodeGenIP) -> OpenMPIRBuilder::InsertPointTy { @@ -6799,13 +6825,17 @@ TEST_F(OpenMPIRBuilderTest, ConstantAllocaRaise) { OpenMPIRBuilder::TargetKernelDefaultAttrs DefaultAttrs = { /*ExecFlags=*/omp::OMPTgtExecModeFlags::OMP_TGT_EXEC_MODE_GENERIC, /*MaxTeams=*/{-1}, /*MinTeams=*/0, /*MaxThreads=*/{0}, /*MinThreads=*/0}; + llvm::OpenMPIRBuilder::TargetDataInfo Info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); ASSERT_EXPECTED_INIT( OpenMPIRBuilder::InsertPointTy, AfterIP, OMPBuilder.createTarget(Loc, /*IsOffloadEntry=*/true, EntryIP, EntryIP, - EntryInfo, DefaultAttrs, RuntimeAttrs, + Info, EntryInfo, DefaultAttrs, RuntimeAttrs, /*IfCond=*/nullptr, CapturedArgs, GenMapInfoCB, - BodyGenCB, SimpleArgAccessorCB)); + BodyGenCB, SimpleArgAccessorCB, CustomMapperCB, + {}, false)); EXPECT_EQ(DL, Builder.getCurrentDebugLocation()); Builder.restoreIP(AfterIP); diff --git a/llvm/unittests/SandboxIR/RegionTest.cpp b/llvm/unittests/SandboxIR/RegionTest.cpp index 09c578bcfefaa..992c3f2cbd2ea 100644 --- a/llvm/unittests/SandboxIR/RegionTest.cpp +++ b/llvm/unittests/SandboxIR/RegionTest.cpp @@ -386,6 +386,49 @@ define void @foo(i8 %v) { ".*already.*"); } +// Check that Aux automatically drops instructions that get deleted. +TEST_F(RegionTest, AuxDeleteInstr) { + parseIR(C, R"IR( +define void @foo(i8 %v) { + %Add0 = add i8 %v, 0, !sandboxvec !0, !sandboxaux !1 + %Add1 = add i8 %v, 1, !sandboxvec !0, !sandboxaux !2 + %Add2 = add i8 %v, 2, !sandboxvec !0, !sandboxaux !3 + %Add3 = add i8 %v, 2, !sandboxvec !0, !sandboxaux !4 + ret void +} + +!0 = distinct !{!"sandboxregion"} +!1 = !{i32 0} +!2 = !{i32 1} +!3 = !{i32 2} +!4 = !{i32 3} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *Add0 = &*It++; + auto *Add1 = &*It++; + auto *Add2 = &*It++; + auto *Add3 = &*It++; + SmallVector> Regions = + sandboxir::Region::createRegionsFromMD(*F, *TTI); + auto &R = *Regions[0]; + EXPECT_THAT(R.getAux(), testing::ElementsAre(Add0, Add1, Add2, Add3)); + // Now delete Add1 and check that Aux contains nullptr instead of Add1. + Add2->eraseFromParent(); + EXPECT_THAT(R.getAux(), testing::ElementsAre(Add0, Add1, Add3)); + { + // Check that metadata have also been updated. + // But first drop Add3 to create a legal Aux vector with no gaps. 
+ Add3->eraseFromParent(); + SmallVector> Regions = + sandboxir::Region::createRegionsFromMD(*F, *TTI); + EXPECT_THAT(Regions[0]->getAux(), testing::ElementsAre(Add0, Add1)); + } +} + TEST_F(RegionTest, AuxRoundTrip) { parseIR(C, R"IR( define i8 @foo(i8 %v0, i8 %v1) { diff --git a/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp b/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp index 68338b569a208..63ac8f993ecdc 100644 --- a/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp +++ b/llvm/unittests/TargetParser/RISCVTargetParserTest.cpp @@ -14,20 +14,20 @@ using namespace llvm; namespace { TEST(RISCVVType, CheckSameRatioLMUL) { // Smaller LMUL. - EXPECT_EQ(RISCVII::LMUL_1, - RISCVVType::getSameRatioLMUL(16, RISCVII::LMUL_2, 8)); - EXPECT_EQ(RISCVII::LMUL_F2, - RISCVVType::getSameRatioLMUL(16, RISCVII::LMUL_1, 8)); + EXPECT_EQ(RISCVVType::LMUL_1, + RISCVVType::getSameRatioLMUL(16, RISCVVType::LMUL_2, 8)); + EXPECT_EQ(RISCVVType::LMUL_F2, + RISCVVType::getSameRatioLMUL(16, RISCVVType::LMUL_1, 8)); // Smaller fractional LMUL. - EXPECT_EQ(RISCVII::LMUL_F8, - RISCVVType::getSameRatioLMUL(16, RISCVII::LMUL_F4, 8)); + EXPECT_EQ(RISCVVType::LMUL_F8, + RISCVVType::getSameRatioLMUL(16, RISCVVType::LMUL_F4, 8)); // Bigger LMUL. - EXPECT_EQ(RISCVII::LMUL_2, - RISCVVType::getSameRatioLMUL(8, RISCVII::LMUL_1, 16)); - EXPECT_EQ(RISCVII::LMUL_1, - RISCVVType::getSameRatioLMUL(8, RISCVII::LMUL_F2, 16)); + EXPECT_EQ(RISCVVType::LMUL_2, + RISCVVType::getSameRatioLMUL(8, RISCVVType::LMUL_1, 16)); + EXPECT_EQ(RISCVVType::LMUL_1, + RISCVVType::getSameRatioLMUL(8, RISCVVType::LMUL_F2, 16)); // Bigger fractional LMUL. - EXPECT_EQ(RISCVII::LMUL_F2, - RISCVVType::getSameRatioLMUL(8, RISCVII::LMUL_F4, 16)); + EXPECT_EQ(RISCVVType::LMUL_F2, + RISCVVType::getSameRatioLMUL(8, RISCVVType::LMUL_F4, 16)); } } // namespace diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp index bb809bf33420e..9a7ee8214d10a 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/DependencyGraphTest.cpp @@ -940,10 +940,11 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %new1, i8 %new2) { TEST_F(DependencyGraphTest, EraseInstrCallback) { parseIR(C, R"IR( -define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %arg) { +define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %arg) { store i8 %v1, ptr %ptr store i8 %v2, ptr %ptr store i8 %v3, ptr %ptr + store i8 %v4, ptr %ptr ret void } )IR"); @@ -955,17 +956,27 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %arg) { auto *S1 = cast(&*It++); auto *S2 = cast(&*It++); auto *S3 = cast(&*It++); + auto *S4NotInDAG = cast(&*It++); // Check erase instruction callback. sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); DAG.extend({S1, S3}); + auto *S1MemN = cast(DAG.getNode(S1)); + auto *S2MemN = cast(DAG.getNode(S2)); + auto *S3MemN = cast(DAG.getNode(S3)); + EXPECT_EQ(S1MemN->getNumUnscheduledSuccs(), 2u); + EXPECT_EQ(S2MemN->getNumUnscheduledSuccs(), 1u); + EXPECT_EQ(S3MemN->getNumUnscheduledSuccs(), 0u); S2->eraseFromParent(); - auto *DeletedN = DAG.getNodeOrNull(S2); + // Check that the DAG Node for S2 no longer exists. + auto *DeletedN = DAG.getNode(S2); EXPECT_TRUE(DeletedN == nullptr); + // Check that dependencies are maintained. 
+ EXPECT_THAT(S3MemN->preds(DAG), testing::UnorderedElementsAre(S1MemN)); + // Also check that UnscheduledSuccs was updated for S1. + EXPECT_EQ(S1MemN->getNumUnscheduledSuccs(), 1u); // Check the MemDGNode chain. - auto *S1MemN = cast(DAG.getNode(S1)); - auto *S3MemN = cast(DAG.getNode(S3)); EXPECT_EQ(S1MemN->getNextNode(), S3MemN); EXPECT_EQ(S3MemN->getPrevNode(), S1MemN); @@ -973,7 +984,39 @@ define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %arg) { S1->eraseFromParent(); EXPECT_EQ(S3MemN->getPrevNode(), nullptr); - // TODO: Check the dependencies to/from NewSN after they land. + // Check that we don't crash if we erase a node not in the DAG. + S4NotInDAG->eraseFromParent(); +} + +// Same but check that we don't update UnscheduledSuccs when Node is scheduled. +TEST_F(DependencyGraphTest, EraseInstrCallbackScheduled) { + parseIR(C, R"IR( +define void @foo(ptr %ptr, i8 %v1, i8 %v2, i8 %v3, i8 %v4, i8 %arg) { + store i8 %v1, ptr %ptr + store i8 %v2, ptr %ptr + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *S1 = cast(&*It++); + auto *S2 = cast(&*It++); + + sandboxir::DependencyGraph DAG(getAA(*LLVMF), Ctx); + DAG.extend({S1, S2}); + auto *S1MemN = cast(DAG.getNode(S1)); + auto *S2MemN = cast(DAG.getNode(S2)); + EXPECT_EQ(S1MemN->getNumUnscheduledSuccs(), 1u); + EXPECT_EQ(S2MemN->getNumUnscheduledSuccs(), 0u); + // Mark S2 as scheduled and erase it. + S2MemN->setScheduled(true); + S2->eraseFromParent(); + EXPECT_EQ(DAG.getNode(S2), nullptr); + // Check that we did not update S1's UnscheduledSuccs + EXPECT_EQ(S1MemN->getNumUnscheduledSuccs(), 1u); } TEST_F(DependencyGraphTest, MoveInstrCallback) { diff --git a/llvm/utils/TableGen/Common/CodeGenSchedule.h b/llvm/utils/TableGen/Common/CodeGenSchedule.h index 981782c17c48c..8343257b458dd 100644 --- a/llvm/utils/TableGen/Common/CodeGenSchedule.h +++ b/llvm/utils/TableGen/Common/CodeGenSchedule.h @@ -467,23 +467,6 @@ class CodeGenSchedModels { public: CodeGenSchedModels(const RecordKeeper &RK, const CodeGenTarget &TGT); - // iterator access to the scheduling classes. 
- using class_iterator = std::vector::iterator; - using const_class_iterator = std::vector::const_iterator; - class_iterator classes_begin() { return SchedClasses.begin(); } - const_class_iterator classes_begin() const { return SchedClasses.begin(); } - class_iterator classes_end() { return SchedClasses.end(); } - const_class_iterator classes_end() const { return SchedClasses.end(); } - iterator_range classes() { - return make_range(classes_begin(), classes_end()); - } - iterator_range classes() const { - return make_range(classes_begin(), classes_end()); - } - ArrayRef explicit_classes() const { - return ArrayRef(SchedClasses).take_front(NumInstrSchedClasses); - } - const Record *getModelOrItinDef(const Record *ProcDef) const { const Record *ModelDef = ProcDef->getValueAsDef("SchedModel"); const Record *ItinsDef = ProcDef->getValueAsDef("ProcItin"); @@ -497,13 +480,13 @@ class CodeGenSchedModels { const CodeGenProcModel &getModelForProc(const Record *ProcDef) const { const Record *ModelDef = getModelOrItinDef(ProcDef); - ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef); + auto I = ProcModelMap.find(ModelDef); assert(I != ProcModelMap.end() && "missing machine model"); return ProcModels[I->second]; } const CodeGenProcModel &getProcModel(const Record *ModelDef) const { - ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef); + auto I = ProcModelMap.find(ModelDef); assert(I != ProcModelMap.end() && "missing machine model"); return ProcModels[I->second]; } @@ -512,10 +495,6 @@ class CodeGenSchedModels { static_cast(*this).getProcModel(ModelDef)); } - // Iterate over the unique processor models. - using ProcIter = std::vector::const_iterator; - ProcIter procModelBegin() const { return ProcModels.begin(); } - ProcIter procModelEnd() const { return ProcModels.end(); } ArrayRef procModels() const { return ProcModels; } // Return true if any processors have itineraries. @@ -564,10 +543,10 @@ class CodeGenSchedModels { // for NoItinerary. 
unsigned getSchedClassIdx(const CodeGenInstruction &Inst) const; - using SchedClassIter = std::vector::const_iterator; - SchedClassIter schedClassBegin() const { return SchedClasses.begin(); } - SchedClassIter schedClassEnd() const { return SchedClasses.end(); } ArrayRef schedClasses() const { return SchedClasses; } + ArrayRef explicitSchedClasses() const { + return schedClasses().take_front(NumInstrSchedClasses); + } unsigned numInstrSchedClasses() const { return NumInstrSchedClasses; } diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 3ea76ed414d91..377bfb593be5f 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -1244,7 +1244,7 @@ void InstrInfoEmitter::emitEnums( OS << "#undef GET_INSTRINFO_SCHED_ENUM\n"; OS << "namespace llvm::" << Namespace << "::Sched {\n\n"; OS << " enum {\n"; - auto ExplictClasses = SchedModels.explicit_classes(); + auto ExplictClasses = SchedModels.explicitSchedClasses(); for (const auto &[Idx, Class] : enumerate(ExplictClasses)) OS << " " << Class.Name << "\t= " << Idx << ",\n"; OS << " SCHED_LIST_END = " << ExplictClasses.size() << '\n'; diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index aec05f1ae7742..d4510f2757349 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -106,7 +106,7 @@ class SubtargetEmitter { void emitStageAndOperandCycleData( raw_ostream &OS, std::vector> &ProcItinLists); void emitItineraries(raw_ostream &OS, - std::vector> &ProcItinLists); + ArrayRef> ProcItinLists); unsigned emitRegisterFileTables(const CodeGenProcModel &ProcModel, raw_ostream &OS); void emitLoadStoreQueueInfo(const CodeGenProcModel &ProcModel, @@ -477,7 +477,6 @@ void SubtargetEmitter::emitStageAndOperandCycleData( // Emit functional units for all the itineraries. for (const CodeGenProcModel &ProcModel : SchedModels.procModels()) { - if (!ItinsDefSet.insert(ProcModel.ItinsDef).second) continue; @@ -489,25 +488,23 @@ void SubtargetEmitter::emitStageAndOperandCycleData( OS << "\n// Functional units for \"" << Name << "\"\n" << "namespace " << Name << "FU {\n"; - for (unsigned J = 0, FUN = FUs.size(); J < FUN; ++J) - OS << " const InstrStage::FuncUnits " << FUs[J]->getName() - << " = 1ULL << " << J << ";\n"; + for (const auto &[Idx, FU] : enumerate(FUs)) + OS << " const InstrStage::FuncUnits " << FU->getName() << " = 1ULL << " + << Idx << ";\n"; OS << "} // end namespace " << Name << "FU\n"; ConstRecVec BPs = ProcModel.ItinsDef->getValueAsListOfDefs("BP"); - if (!BPs.empty()) { - OS << "\n// Pipeline forwarding paths for itineraries \"" << Name - << "\"\n" - << "namespace " << Name << "Bypass {\n"; + if (BPs.empty()) + continue; + OS << "\n// Pipeline forwarding paths for itineraries \"" << Name << "\"\n" + << "namespace " << Name << "Bypass {\n"; - OS << " const unsigned NoBypass = 0;\n"; - for (unsigned J = 0, BPN = BPs.size(); J < BPN; ++J) - OS << " const unsigned " << BPs[J]->getName() << " = 1 << " << J - << ";\n"; + OS << " const unsigned NoBypass = 0;\n"; + for (const auto &[Idx, BP] : enumerate(BPs)) + OS << " const unsigned " << BP->getName() << " = 1 << " << Idx << ";\n"; - OS << "} // end namespace " << Name << "Bypass\n"; - } + OS << "} // end namespace " << Name << "Bypass\n"; } // Begin stages table @@ -647,46 +644,39 @@ void SubtargetEmitter::emitStageAndOperandCycleData( // CodeGenSchedClass::Index. 
// void SubtargetEmitter::emitItineraries( - raw_ostream &OS, std::vector> &ProcItinLists) { + raw_ostream &OS, ArrayRef> ProcItinLists) { // Multiple processor models may share an itinerary record. Emit it once. SmallPtrSet ItinsDefSet; - // For each processor's machine model - std::vector>::iterator ProcItinListsIter = - ProcItinLists.begin(); - for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); - PI != PE; ++PI, ++ProcItinListsIter) { - - const Record *ItinsDef = PI->ItinsDef; + for (const auto &[Proc, ItinList] : + zip_equal(SchedModels.procModels(), ProcItinLists)) { + const Record *ItinsDef = Proc.ItinsDef; if (!ItinsDefSet.insert(ItinsDef).second) continue; - // Get the itinerary list for the processor. - assert(ProcItinListsIter != ProcItinLists.end() && "bad iterator"); - std::vector &ItinList = *ProcItinListsIter; - // Empty itineraries aren't referenced anywhere in the tablegen output // so don't emit them. if (ItinList.empty()) continue; + // Begin processor itinerary table OS << "\n"; - OS << "static const llvm::InstrItinerary "; + OS << "static constexpr llvm::InstrItinerary " << ItinsDef->getName() + << "[] = {\n"; - // Begin processor itinerary table - OS << ItinsDef->getName() << "[] = {\n"; + ArrayRef ItinSchedClasses = + SchedModels.schedClasses().take_front(ItinList.size()); // For each itinerary class in CodeGenSchedClass::Index order. - for (unsigned J = 0, M = ItinList.size(); J < M; ++J) { - InstrItinerary &Intinerary = ItinList[J]; - + for (const auto &[Idx, Intinerary, SchedClass] : + enumerate(ItinList, ItinSchedClasses)) { // Emit Itinerary in the form of - // { firstStage, lastStage, firstCycle, lastCycle } // index + // { NumMicroOps, FirstStage, LastStage, FirstOperandCycle, + // LastOperandCycle } // index class name OS << " { " << Intinerary.NumMicroOps << ", " << Intinerary.FirstStage << ", " << Intinerary.LastStage << ", " << Intinerary.FirstOperandCycle - << ", " << Intinerary.LastOperandCycle << " }" - << ", // " << J << " " << SchedModels.getSchedClass(J).Name << "\n"; + << ", " << Intinerary.LastOperandCycle << " }" << ", // " << Idx << " " + << SchedClass.Name << "\n"; } // End processor itinerary table OS << " { 0, uint16_t(~0U), uint16_t(~0U), uint16_t(~0U), uint16_t(~0U) }" @@ -1442,18 +1432,16 @@ void SubtargetEmitter::emitSchedClassTables(SchedClassTables &SchedTables, OS << "}; // " << Target << "ReadAdvanceTable\n"; // Emit a SchedClass table for each processor. - for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(), - PE = SchedModels.procModelEnd(); - PI != PE; ++PI) { - if (!PI->hasInstrSchedModel()) + for (const auto &[Idx, Proc] : enumerate(SchedModels.procModels())) { + if (!Proc.hasInstrSchedModel()) continue; std::vector &SCTab = - SchedTables.ProcSchedClasses[1 + (PI - SchedModels.procModelBegin())]; + SchedTables.ProcSchedClasses[1 + Idx]; OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup, RetireOOO," << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n"; - OS << "static const llvm::MCSchedClassDesc " << PI->ModelName + OS << "static const llvm::MCSchedClassDesc " << Proc.ModelName << "SchedClasses[] = {\n"; // The first class is always invalid. 
We have no way to distinguish it except by
@@ -1480,7 +1468,7 @@ void SubtargetEmitter::emitSchedClassTables(SchedClassTables &SchedTables,
          << format("%2d", MCDesc.ReadAdvanceIdx) << ", "
          << MCDesc.NumReadAdvanceEntries << "}, // #" << SCIdx << '\n';
     }
-    OS << "}; // " << PI->ModelName << "SchedClasses\n";
+    OS << "}; // " << Proc.ModelName << "SchedClasses\n";
   }
 }
@@ -1528,14 +1516,10 @@ void SubtargetEmitter::emitProcessorModels(raw_ostream &OS) {
     OS << " " << PM.Index << ", // Processor ID\n";
     if (PM.hasInstrSchedModel())
-      OS << " " << PM.ModelName << "ProcResources"
-         << ",\n"
-         << " " << PM.ModelName << "SchedClasses"
-         << ",\n"
+      OS << " " << PM.ModelName << "ProcResources" << ",\n"
+         << " " << PM.ModelName << "SchedClasses" << ",\n"
          << " " << PM.ProcResourceDefs.size() + 1 << ",\n"
-         << " "
-         << (SchedModels.schedClassEnd() - SchedModels.schedClassBegin())
-         << ",\n";
+         << " " << SchedModels.schedClasses().size() << ",\n";
     else
       OS << " nullptr, nullptr, 0, 0,"
          << " // No instruction-level machine model.\n";
@@ -1747,7 +1731,7 @@ void SubtargetEmitter::emitSchedModelHelpersImpl(
                  ? "if (CPUID == "
                  : "if (SchedModel->getProcessorID() == ");
       OS << PI << ") ";
-      OS << "{ // " << (SchedModels.procModelBegin() + PI)->ModelName << '\n';
+      OS << "{ // " << SchedModels.procModels()[PI].ModelName << '\n';
     }
     // Now emit transitions associated with processor PI.
diff --git a/mlir/cmake/modules/MLIRDetectPythonEnv.cmake b/mlir/cmake/modules/MLIRDetectPythonEnv.cmake
index 3a87d39c28a06..f7a6fa6248440 100644
--- a/mlir/cmake/modules/MLIRDetectPythonEnv.cmake
+++ b/mlir/cmake/modules/MLIRDetectPythonEnv.cmake
@@ -22,6 +22,21 @@ macro(mlir_configure_python_dev_packages)
   find_package(Python3 ${LLVM_MINIMUM_PYTHON_VERSION}
     COMPONENTS Interpreter ${_python_development_component} REQUIRED)

+  # We look for both Python3 and Python; the search algorithms should be
+  # consistent, otherwise disastrous results are almost guaranteed.
+  # Warn if the policies for treating virtual environments are not defined
+  # consistently.
+  # For more details check issue #126162.
+  if(((DEFINED Python_FIND_VIRTUALENV) AND (NOT DEFINED Python3_FIND_VIRTUALENV)) OR
+     ((NOT DEFINED Python_FIND_VIRTUALENV) AND (DEFINED Python3_FIND_VIRTUALENV)))
+    message(WARNING "Only one of Python3_FIND_VIRTUALENV and Python_FIND_VIRTUALENV variables is defined. "
+                    "Make sure that both variables are defined and have the same value.")
+  elseif((DEFINED Python_FIND_VIRTUALENV) AND (DEFINED Python3_FIND_VIRTUALENV) AND
+         (NOT Python_FIND_VIRTUALENV STREQUAL Python3_FIND_VIRTUALENV))
+    message(WARNING "Python3_FIND_VIRTUALENV and Python_FIND_VIRTUALENV are defined differently. "
+                    "Make sure that the variables have the same values.")
+  endif()
+
   # It's a little silly to detect Python a second time, but nanobind's cmake
   # code looks for Python_ not Python3_.
   find_package(Python ${LLVM_MINIMUM_PYTHON_VERSION}
diff --git a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
index 9a890ae24d8fc..139360f8bd3fc 100644
--- a/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/CompilationInterfaces.h
@@ -79,6 +79,12 @@ class TargetOptions {
   std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
   tokenizeCmdOptions() const;

+  /// Returns a tokenization of the portion of the command line options that
+  /// follows `startsWith`, up to the end of the options, and removes that
+  /// portion (including `startsWith`) from the stored options.
+  std::pair<llvm::BumpPtrAllocator, SmallVector<const char *>>
+  tokenizeAndRemoveSuffixCmdOptions(llvm::StringRef startsWith);
+
   /// Returns the compilation target.
   CompilationTarget getCompilationTarget() const;
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 7efa4ffa2aa6f..01059e42974d0 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -451,12 +451,12 @@ def ROCDL_GlobalLoadLDSOp :
 def ROCDLBufferRsrc : LLVM_PointerInAddressSpace<8>;

 def ROCDL_MakeBufferRsrcOp :
-  ROCDL_IntrOp<"make.buffer.rsrc", [], [0], [Pure], 1>,
+  ROCDL_IntrOp<"make.buffer.rsrc", [0], [0], [Pure], 1>,
   Arguments<(ins LLVM_AnyPointer:$base,
                  I16:$stride,
                  I32:$numRecords,
                  I32:$flags)> {
-  let results = (outs ROCDLBufferRsrc:$res);
+  let results = (outs LLVM_AnyPointer:$res);
   let assemblyFormat = "operands attr-dict `:` type($base) `to` type($res)";
 }
diff --git a/mlir/include/mlir/Dialect/Math/IR/MathOps.td b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
index 8a277320e2f91..16ce4e2366c76 100644
--- a/mlir/include/mlir/Dialect/Math/IR/MathOps.td
+++ b/mlir/include/mlir/Dialect/Math/IR/MathOps.td
@@ -560,6 +560,31 @@ def Math_ErfOp : Math_FloatUnaryOp<"erf"> {
   let hasFolder = 1;
 }

+//===----------------------------------------------------------------------===//
+// ErfcOp
+//===----------------------------------------------------------------------===//
+
+def Math_ErfcOp : Math_FloatUnaryOp<"erfc"> {
+  let summary = "complementary error function of the specified value";
+  let description = [{
+    The `erfc` operation computes the complementary error function, defined as
+    1-erf(x). This function is part of libm and is needed for accuracy, since
+    simply calculating 1-erf(x) when erf(x) is close to 1 (that is, for large
+    x) will give inaccurate results. It takes one operand of floating point
+    type (i.e., scalar, tensor or vector) and returns one result of the same
+    type. It has no standard attributes.
+
+    Example:
+
+    ```mlir
+    // Scalar complementary error function value.
+    %a = math.erfc %b : f64
+    ```
+  }];
+  let hasFolder = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // ExpOp
diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Approximation.h b/mlir/include/mlir/Dialect/Math/Transforms/Approximation.h
index b4ebc2f0f8fcd..ecfdb71817dff 100644
--- a/mlir/include/mlir/Dialect/Math/Transforms/Approximation.h
+++ b/mlir/include/mlir/Dialect/Math/Transforms/Approximation.h
@@ -23,6 +23,14 @@ struct ErfPolynomialApproximation : public OpRewritePattern<math::ErfOp> {
                                 PatternRewriter &rewriter) const final;
 };

+struct ErfcPolynomialApproximation : public OpRewritePattern<math::ErfcOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(math::ErfcOp op,
+                                PatternRewriter &rewriter) const final;
+};
+
 } // namespace math
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
index ea7a556297a76..9adc1c6940a15 100644
--- a/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Math/Transforms/Passes.h
@@ -47,6 +47,7 @@ struct MathPolynomialApproximationOptions {
 void populatePolynomialApproximateTanhPattern(RewritePatternSet &patterns);

 void populatePolynomialApproximateErfPattern(RewritePatternSet &patterns);
+void populatePolynomialApproximateErfcPattern(RewritePatternSet &patterns);

 // Adds patterns to convert to f32 around math functions for which `predicate`
 // returns true.
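The new `populatePolynomialApproximateErfcPattern` hook above is consumed the same way as its erf/tanh neighbors: collect the pattern into a `RewritePatternSet` and hand it to the greedy driver. The following is a minimal sketch, not part of the patch; it assumes the hook lives at `mlir` namespace scope alongside the existing populate functions, and the `approximateErfc` helper name is purely illustrative.

```cpp
#include "mlir/Dialect/Math/Transforms/Passes.h"
#include "mlir/IR/PatternMatch.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"

using namespace mlir;

// Hypothetical helper: expands every math.erfc under `op` using the
// ErfcPolynomialApproximation pattern declared in Approximation.h above.
static LogicalResult approximateErfc(Operation *op) {
  RewritePatternSet patterns(op->getContext());
  populatePolynomialApproximateErfcPattern(patterns);
  // Apply the pattern greedily until a fixpoint is reached.
  return applyPatternsGreedily(op, std::move(patterns));
}
```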
diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index 983627027ac9c..2d8e022190f62 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -653,7 +653,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [
     will be executed in parallel by threads in the current context. These
     iterations are spread across threads that already exist in the enclosing
     region.
-
+
     The body region can only contain a single block which must contain a
     single operation. This operation must be another compatible loop wrapper
     or an `omp.loop_nest`.
@@ -1023,6 +1023,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
                        OptionalAttr:$members_index,
                        Variadic:$bounds, /* rank-0 to rank-{n-1} */
                        OptionalAttr:$map_type,
+                       OptionalAttr:$mapper_id,
                        OptionalAttr:$map_capture_type,
                        OptionalAttr:$name,
                        DefaultValuedAttr:$partial_map);
@@ -1076,6 +1077,8 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
     - 'map_type': OpenMP map type for this map capture, for example: from, to and
       always. It's a bitfield composed of the OpenMP runtime flags stored in
       OpenMPOffloadMappingFlags.
+    - 'mapper_id': OpenMP mapper map type modifier for this map capture. It is
+      used to specify a user-defined mapper to be used for mapping.
     - 'map_capture_type': Capture type for the variable e.g. this, byref, byvalue, byvla
       this can affect how the variable is lowered.
     - `name`: Holds the name of variable as specified in user clause (including bounds).
@@ -1087,6 +1090,7 @@ def MapInfoOp : OpenMP_Op<"map.info", [AttrSizedOperandSegments]> {
     `var_ptr` `(` $var_ptr `:` type($var_ptr) `,` $var_type `)`
     oilist(
         `var_ptr_ptr` `(` $var_ptr_ptr `:` type($var_ptr_ptr) `)`
+      | `mapper` `(` $mapper_id `)`
       | `map_clauses` `(` custom($map_type) `)`
       | `capture` `(` custom($map_capture_type) `)`
       | `members` `(` $members `:` custom($members_index) `:` type($members) `)`
@@ -1749,6 +1753,62 @@ def ScanOp : OpenMP_Op<"scan", [
   let hasVerifier = 1;
 }

+//===----------------------------------------------------------------------===//
+// 2.19.7.3 Declare Mapper Directive
+//===----------------------------------------------------------------------===//
+def DeclareMapperOp : OpenMP_Op<"declare_mapper", [
+      IsolatedFromAbove,
+      RecipeInterface,
+      SingleBlock,
+      Symbol
+    ]> {
+  let summary = "declare mapper directive";
+  let description = [{
+    The declare mapper directive declares a user-defined mapper for a given
+    type, and defines a mapper-identifier that can be used in a map clause.
+  }] # clausesDescription;
+
+  let arguments = (ins SymbolNameAttr:$sym_name,
+                       TypeAttr:$type);
+
+  let regions = (region AnyRegion:$body);
+
+  let assemblyFormat = "$sym_name `:` $type $body attr-dict";
+
+  let extraClassDeclaration = [{
+    /// Get the DeclareMapperInfoOp terminator.
+    DeclareMapperInfoOp getDeclareMapperInfo() {
+      return cast<DeclareMapperInfoOp>(getRegion().getBlocks().front().getTerminator());
+    }
+
+    /// Get the SymVal block argument.
+    BlockArgument getSymVal() {
+      return getRegion().getArgument(0);
+    }
+  }];
+
+  let hasRegionVerifier = 1;
+}
+
+def DeclareMapperInfoOp : OpenMP_Op<"declare_mapper.info", [
+       HasParent<"DeclareMapperOp">,
+       Terminator
+    ], clauses = [
+       OpenMP_MapClause
+    ]> {
+  let summary = "declare mapper info";
+  let description = [{
+    This op is used to capture the map information related to its
+    parent DeclareMapperOp.
+ }] # clausesDescription; + + let builders = [ + OpBuilder<(ins CArg<"const DeclareMapperInfoOperands &">:$clauses)> + ]; + + let hasVerifier = 1; +} + //===----------------------------------------------------------------------===// // 2.19.5.7 declare reduction Directive //===----------------------------------------------------------------------===// @@ -1861,7 +1921,7 @@ def MaskedOp : OpenMP_Op<"masked", clauses = [ ], singleRegion = 1> { let summary = "masked construct"; let description = [{ - Masked construct allows to specify a structured block to be executed by a subset of + Masked construct allows to specify a structured block to be executed by a subset of threads of the current team. }] # clausesDescription; diff --git a/mlir/include/mlir/IR/CMakeLists.txt b/mlir/include/mlir/IR/CMakeLists.txt index 0c7937dfd69e5..846547ff131e3 100644 --- a/mlir/include/mlir/IR/CMakeLists.txt +++ b/mlir/include/mlir/IR/CMakeLists.txt @@ -2,6 +2,8 @@ add_mlir_interface(SymbolInterfaces) add_mlir_interface(RegionKindInterface) set(LLVM_TARGET_DEFINITIONS OpAsmInterface.td) +mlir_tablegen(OpAsmAttrInterface.h.inc -gen-attr-interface-decls) +mlir_tablegen(OpAsmAttrInterface.cpp.inc -gen-attr-interface-defs) mlir_tablegen(OpAsmOpInterface.h.inc -gen-op-interface-decls) mlir_tablegen(OpAsmOpInterface.cpp.inc -gen-op-interface-defs) mlir_tablegen(OpAsmTypeInterface.h.inc -gen-type-interface-decls) diff --git a/mlir/include/mlir/IR/OpAsmInterface.td b/mlir/include/mlir/IR/OpAsmInterface.td index 34c830a12856f..1bd8eb04714c5 100644 --- a/mlir/include/mlir/IR/OpAsmInterface.td +++ b/mlir/include/mlir/IR/OpAsmInterface.td @@ -127,6 +127,35 @@ def OpAsmTypeInterface : TypeInterface<"OpAsmTypeInterface"> { "void", "getAsmName", (ins "::mlir::OpAsmSetNameFn":$setNameFn), "", ";" >, + InterfaceMethod<[{ + Get a name to use when generating an alias for this type. + }], + "::mlir::OpAsmDialectInterface::AliasResult", "getAlias", + (ins "::llvm::raw_ostream&":$os), "", + "return ::mlir::OpAsmDialectInterface::AliasResult::NoAlias;" + >, + ]; +} + +//===----------------------------------------------------------------------===// +// OpAsmAttrInterface +//===----------------------------------------------------------------------===// + +def OpAsmAttrInterface : AttrInterface<"OpAsmAttrInterface"> { + let description = [{ + This interface provides hooks to interact with the AsmPrinter and AsmParser + classes. + }]; + let cppNamespace = "::mlir"; + + let methods = [ + InterfaceMethod<[{ + Get a name to use when generating an alias for this attribute. + }], + "::mlir::OpAsmDialectInterface::AliasResult", "getAlias", + (ins "::llvm::raw_ostream&":$os), "", + "return ::mlir::OpAsmDialectInterface::AliasResult::NoAlias;" + >, ]; } diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index 5eb8b4a5cff5b..a863e881ee7c8 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -1825,6 +1825,7 @@ ParseResult parseDimensionList(OpAsmParser &parser, //===--------------------------------------------------------------------===// /// The OpAsmOpInterface, see OpAsmInterface.td for more details. 
+#include "mlir/IR/OpAsmAttrInterface.h.inc" #include "mlir/IR/OpAsmOpInterface.h.inc" #include "mlir/IR/OpAsmTypeInterface.h.inc" diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 4642d58760ca8..6c673295d8dcc 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -407,6 +407,10 @@ class ModuleImport { /// always requires a symbol name. FlatSymbolRefAttr getOrCreateNamelessSymbolName(llvm::GlobalVariable *globalVar); + /// Sets the insertion point for the next global operation and returns a + /// guard restoring the previous one. If `globalInsertionOp` is set, the new + /// point is placed after that operation; otherwise it is the module start. + OpBuilder::InsertionGuard setGlobalInsertionPoint(); /// Builder pointing at where the next instruction should be generated. OpBuilder builder; @@ -416,8 +420,6 @@ class ModuleImport { Operation *constantInsertionOp = nullptr; /// Operation to insert the next global after. Operation *globalInsertionOp = nullptr; - /// Operation to insert the next alias after. - Operation *aliasInsertionOp = nullptr; /// Operation to insert comdat selector operations into. ComdatOp globalComdatOp = nullptr; /// The current context. diff --git a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp index ffc1a962f3811..2d56cfcf90474 100644 --- a/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp +++ b/mlir/lib/Conversion/MathToLibm/MathToLibm.cpp @@ -184,6 +184,7 @@ void mlir::populateMathToLibmConversionPatterns( populatePatternsForOp(patterns, benefit, ctx, "cosf", "cos"); populatePatternsForOp(patterns, benefit, ctx, "coshf", "cosh"); populatePatternsForOp(patterns, benefit, ctx, "erff", "erf"); + populatePatternsForOp(patterns, benefit, ctx, "erfcf", "erfc"); populatePatternsForOp(patterns, benefit, ctx, "expf", "exp"); populatePatternsForOp(patterns, benefit, ctx, "exp2f", "exp2"); populatePatternsForOp(patterns, benefit, ctx, "expm1f", diff --git a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp index 12e3c07669839..7888745dc6920 100644 --- a/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp +++ b/mlir/lib/Conversion/OpenMPToLLVM/OpenMPToLLVM.cpp @@ -186,6 +186,32 @@ struct MapInfoOpConversion : public ConvertOpToLLVMPattern { } }; +struct DeclMapperOpConversion + : public ConvertOpToLLVMPattern<omp::DeclareMapperOp> { + using ConvertOpToLLVMPattern<omp::DeclareMapperOp>::ConvertOpToLLVMPattern; + LogicalResult + matchAndRewrite(omp::DeclareMapperOp curOp, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + const TypeConverter *converter = ConvertToLLVMPattern::getTypeConverter(); + SmallVector<NamedAttribute> newAttrs; + newAttrs.emplace_back(curOp.getSymNameAttrName(), curOp.getSymNameAttr()); + newAttrs.emplace_back( + curOp.getTypeAttrName(), + TypeAttr::get(converter->convertType(curOp.getType()))); + + auto newOp = rewriter.create<omp::DeclareMapperOp>( + curOp.getLoc(), TypeRange(), adaptor.getOperands(), newAttrs); + rewriter.inlineRegionBefore(curOp.getRegion(), newOp.getRegion(), + newOp.getRegion().end()); + if (failed(rewriter.convertRegionTypes(&newOp.getRegion(), + *this->getTypeConverter()))) + return failure(); + + rewriter.eraseOp(curOp); + return success(); + } +}; + template struct MultiRegionOpConversion : public ConvertOpToLLVMPattern { using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -225,19 +251,21 @@ void mlir::configureOpenMPToLLVMConversionLegality( ConversionTarget
&target, const LLVMTypeConverter &typeConverter) { target.addDynamicallyLegalOp< omp::AtomicReadOp, omp::AtomicWriteOp, omp::CancellationPointOp, - omp::CancelOp, omp::CriticalDeclareOp, omp::FlushOp, omp::MapBoundsOp, - omp::MapInfoOp, omp::OrderedOp, omp::ScanOp, omp::TargetEnterDataOp, - omp::TargetExitDataOp, omp::TargetUpdateOp, omp::ThreadprivateOp, - omp::YieldOp>([&](Operation *op) { - return typeConverter.isLegal(op->getOperandTypes()) && - typeConverter.isLegal(op->getResultTypes()); - }); + omp::CancelOp, omp::CriticalDeclareOp, omp::DeclareMapperInfoOp, + omp::FlushOp, omp::MapBoundsOp, omp::MapInfoOp, omp::OrderedOp, + omp::ScanOp, omp::TargetEnterDataOp, omp::TargetExitDataOp, + omp::TargetUpdateOp, omp::ThreadprivateOp, omp::YieldOp>( + [&](Operation *op) { + return typeConverter.isLegal(op->getOperandTypes()) && + typeConverter.isLegal(op->getResultTypes()); + }); target.addDynamicallyLegalOp< - omp::AtomicUpdateOp, omp::CriticalOp, omp::DeclareReductionOp, - omp::DistributeOp, omp::LoopNestOp, omp::LoopOp, omp::MasterOp, - omp::OrderedRegionOp, omp::ParallelOp, omp::SectionOp, omp::SectionsOp, - omp::SimdOp, omp::SingleOp, omp::TargetDataOp, omp::TargetOp, - omp::TaskgroupOp, omp::TaskloopOp, omp::TaskOp, omp::TeamsOp, + omp::AtomicUpdateOp, omp::CriticalOp, omp::DeclareMapperOp, + omp::DeclareReductionOp, omp::DistributeOp, omp::LoopNestOp, omp::LoopOp, + omp::MasterOp, omp::OrderedRegionOp, omp::ParallelOp, + omp::PrivateClauseOp, omp::SectionOp, omp::SectionsOp, omp::SimdOp, + omp::SingleOp, omp::TargetDataOp, omp::TargetOp, omp::TaskgroupOp, + omp::TaskloopOp, omp::TaskOp, omp::TeamsOp, omp::WsloopOp>([&](Operation *op) { return std::all_of(op->getRegions().begin(), op->getRegions().end(), [&](Region ®ion) { @@ -267,12 +295,13 @@ void mlir::populateOpenMPToLLVMConversionPatterns(LLVMTypeConverter &converter, [&](omp::MapBoundsType type) -> Type { return type; }); patterns.add< - AtomicReadOpConversion, MapInfoOpConversion, + AtomicReadOpConversion, DeclMapperOpConversion, MapInfoOpConversion, MultiRegionOpConversion, MultiRegionOpConversion, RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, + RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, RegionLessOpConversion, diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index 1bdeb3e356f4b..976432ea37120 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -2591,6 +2591,18 @@ TargetOptions::tokenizeCmdOptions() const { return tokenizeCmdOptions(cmdOptions); } +std::pair> +TargetOptions::tokenizeAndRemoveSuffixCmdOptions(llvm::StringRef startsWith) { + size_t startPos = cmdOptions.find(startsWith); + if (startPos == std::string::npos) + return {llvm::BumpPtrAllocator(), SmallVector()}; + + auto tokenized = + tokenizeCmdOptions(cmdOptions.substr(startPos + startsWith.size())); + cmdOptions.resize(startPos); + return tokenized; +} + MLIR_DEFINE_EXPLICIT_TYPE_ID(::mlir::gpu::TargetOptions) #include "mlir/Dialect/GPU/IR/GPUOpInterfaces.cpp.inc" diff --git a/mlir/lib/Dialect/Math/IR/MathOps.cpp b/mlir/lib/Dialect/Math/IR/MathOps.cpp index 42e357c012739..9c4d88e2191ce 100644 --- a/mlir/lib/Dialect/Math/IR/MathOps.cpp +++ b/mlir/lib/Dialect/Math/IR/MathOps.cpp @@ -332,6 +332,24 @@ OpFoldResult math::ErfOp::fold(FoldAdaptor adaptor) { }); } +//===----------------------------------------------------------------------===// +// ErfcOp folder 
+//===----------------------------------------------------------------------===// + +OpFoldResult math::ErfcOp::fold(FoldAdaptor adaptor) { + return constFoldUnaryOpConditional<FloatAttr>( + adaptor.getOperands(), [](const APFloat &a) -> std::optional<APFloat> { + switch (APFloat::SemanticsToEnum(a.getSemantics())) { + case APFloat::Semantics::S_IEEEdouble: + return APFloat(erfc(a.convertToDouble())); + case APFloat::Semantics::S_IEEEsingle: + return APFloat(erfcf(a.convertToFloat())); + default: + return {}; + } + }); +} + //===----------------------------------------------------------------------===// // IPowIOp folder //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp index 777427de9465c..167eebd786dba 100644 --- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp +++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp @@ -173,6 +173,10 @@ handleMultidimensionalVectors(ImplicitLocOpBuilder &builder, // Helper functions to create constants. //----------------------------------------------------------------------------// +static Value boolCst(ImplicitLocOpBuilder &builder, bool value) { + return builder.create<arith::ConstantOp>(builder.getBoolAttr(value)); +} + static Value floatCst(ImplicitLocOpBuilder &builder, float value, Type elementType) { assert((elementType.isF16() || elementType.isF32()) && @@ -1118,6 +1122,103 @@ ErfPolynomialApproximation::matchAndRewrite(math::ErfOp op, return success(); } +// Approximates erfc(x) with p((x - 2) / (x + 2)), where p is a degree-9 +// polynomial. This approximation is based on the following Stack Overflow post: +// https://stackoverflow.com/questions/35966695/vectorizable-implementation-of-complementary-error-function-erfcf +// The Stack Overflow post is in turn based on: +// M. M. Shepherd and J. G. Laframboise, "Chebyshev Approximation of +// (1+2x)exp(x^2)erfc x in 0 <= x < INF", Mathematics of Computation, Vol. 36, +// No. 153, January 1981, pp. 249-253.
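Two identities anchor the construction that follows: erfc(-x) = 2 - erfc(x) recovers the negative half-line from a computation on |x|, and erfc decays so quickly that any f32 input above the clamp value 10.0546875 maps to zero. A small harness for spot-checking lowered results against libm (my illustration, not part of the patch):

#include <cmath>
#include <cstdio>

int main() {
  // erfc(0) == 1, erfc(-x) == 2 - erfc(x), and erfc(11) is below the
  // smallest f32 denormal, which is why the lowering clamps large inputs.
  for (float x : {-2.0f, -0.5f, 0.0f, 0.5f, 2.0f, 11.0f})
    std::printf("erfc(%+g) = %.9g\n", x, std::erfc(x));
  return 0;
}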
+// +// Maximum error: 2.65 ulps +LogicalResult +ErfcPolynomialApproximation::matchAndRewrite(math::ErfcOp op, + PatternRewriter &rewriter) const { + Value x = op.getOperand(); + Type et = getElementTypeOrSelf(x); + + if (!et.isF32()) + return rewriter.notifyMatchFailure(op, "only f32 type is supported."); + std::optional shape = vectorShape(x); + + ImplicitLocOpBuilder builder(op->getLoc(), rewriter); + auto bcast = [&](Value value) -> Value { + return broadcast(builder, value, shape); + }; + + Value trueValue = bcast(boolCst(builder, true)); + Value zero = bcast(floatCst(builder, 0.0f, et)); + Value one = bcast(floatCst(builder, 1.0f, et)); + Value onehalf = bcast(floatCst(builder, 0.5f, et)); + Value neg4 = bcast(floatCst(builder, -4.0f, et)); + Value neg2 = bcast(floatCst(builder, -2.0f, et)); + Value pos2 = bcast(floatCst(builder, 2.0f, et)); + Value posInf = bcast(floatCst(builder, INFINITY, et)); + Value clampVal = bcast(floatCst(builder, 10.0546875f, et)); + + Value a = builder.create(x); + Value p = builder.create(a, pos2); + Value r = builder.create(one, p); + Value q = builder.create(neg4, r, one); + Value t = builder.create(builder.create(q, one), + neg2, a); + Value e = builder.create(builder.create(a), q, t); + q = builder.create(r, e, q); + + p = bcast(floatCst(builder, -0x1.a4a000p-12f, et)); // -4.01139259e-4 + Value c1 = bcast(floatCst(builder, -0x1.42a260p-10f, et)); // -1.23075210e-3 + p = builder.create(p, q, c1); + Value c2 = bcast(floatCst(builder, 0x1.585714p-10f, et)); // 1.31355342e-3 + p = builder.create(p, q, c2); + Value c3 = bcast(floatCst(builder, 0x1.1adcc4p-07f, et)); // 8.63227434e-3 + p = builder.create(p, q, c3); + Value c4 = bcast(floatCst(builder, -0x1.081b82p-07f, et)); // -8.05991981e-3 + p = builder.create(p, q, c4); + Value c5 = bcast(floatCst(builder, -0x1.bc0b6ap-05f, et)); // -5.42046614e-2 + p = builder.create(p, q, c5); + Value c6 = bcast(floatCst(builder, 0x1.4ffc46p-03f, et)); // 1.64055392e-1 + p = builder.create(p, q, c6); + Value c7 = bcast(floatCst(builder, -0x1.540840p-03f, et)); // -1.66031361e-1 + p = builder.create(p, q, c7); + Value c8 = bcast(floatCst(builder, -0x1.7bf616p-04f, et)); // -9.27639827e-2 + p = builder.create(p, q, c8); + Value c9 = bcast(floatCst(builder, 0x1.1ba03ap-02f, et)); // 2.76978403e-1 + p = builder.create(p, q, c9); + + Value d = builder.create(pos2, a, one); + r = builder.create(one, d); + q = builder.create(p, r, r); + Value negfa = builder.create(a); + Value fmaqah = builder.create(q, negfa, onehalf); + Value psubq = builder.create(p, q); + e = builder.create(fmaqah, pos2, psubq); + r = builder.create(e, r, q); + + Value s = builder.create(a, a); + e = builder.create(builder.create(s)); + + t = builder.create(builder.create(a), a, s); + r = builder.create( + r, e, + builder.create(builder.create(r, e), t)); + + Value isNotLessThanInf = builder.create( + builder.create(arith::CmpFPredicate::OLT, a, posInf), + trueValue); + r = builder.create(isNotLessThanInf, + builder.create(x, x), r); + Value isGreaterThanClamp = + builder.create(arith::CmpFPredicate::OGT, a, clampVal); + r = builder.create(isGreaterThanClamp, zero, r); + + Value isNegative = + builder.create(arith::CmpFPredicate::OLT, x, zero); + r = builder.create( + isNegative, builder.create(pos2, r), r); + + rewriter.replaceOp(op, r); + return success(); +} //----------------------------------------------------------------------------// // Exp approximation. 
//----------------------------------------------------------------------------// @@ -1667,6 +1768,11 @@ void mlir::populatePolynomialApproximateErfPattern( patterns.add(patterns.getContext()); } +void mlir::populatePolynomialApproximateErfcPattern( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + template static void populateMathF32ExpansionPattern(RewritePatternSet &patterns, @@ -1690,6 +1796,7 @@ void mlir::populateMathF32ExpansionPatterns( populateMathF32ExpansionPattern(patterns, predicate); populateMathF32ExpansionPattern(patterns, predicate); populateMathF32ExpansionPattern(patterns, predicate); + populateMathF32ExpansionPattern(patterns, predicate); populateMathF32ExpansionPattern(patterns, predicate); populateMathF32ExpansionPattern(patterns, predicate); populateMathF32ExpansionPattern(patterns, predicate); @@ -1734,6 +1841,9 @@ void mlir::populateMathPolynomialApproximationPatterns( CosOp, SinAndCosApproximation>(patterns, predicate); populateMathPolynomialApproximationPattern( patterns, predicate); + populateMathPolynomialApproximationPattern( + patterns, predicate); populateMathPolynomialApproximationPattern( patterns, predicate); populateMathPolynomialApproximationPattern( @@ -1760,9 +1870,10 @@ void mlir::populateMathPolynomialApproximationPatterns( {math::AtanOp::getOperationName(), math::Atan2Op::getOperationName(), math::TanhOp::getOperationName(), math::LogOp::getOperationName(), math::Log2Op::getOperationName(), math::Log1pOp::getOperationName(), - math::ErfOp::getOperationName(), math::ExpOp::getOperationName(), - math::ExpM1Op::getOperationName(), math::CbrtOp::getOperationName(), - math::SinOp::getOperationName(), math::CosOp::getOperationName()}, + math::ErfOp::getOperationName(), math::ErfcOp::getOperationName(), + math::ExpOp::getOperationName(), math::ExpM1Op::getOperationName(), + math::CbrtOp::getOperationName(), math::SinOp::getOperationName(), + math::CosOp::getOperationName()}, name); }); @@ -1774,8 +1885,9 @@ void mlir::populateMathPolynomialApproximationPatterns( math::TanhOp::getOperationName(), math::LogOp::getOperationName(), math::Log2Op::getOperationName(), math::Log1pOp::getOperationName(), math::ErfOp::getOperationName(), - math::AsinOp::getOperationName(), math::AcosOp::getOperationName(), - math::ExpOp::getOperationName(), math::ExpM1Op::getOperationName(), + math::ErfcOp::getOperationName(), math::AsinOp::getOperationName(), + math::AcosOp::getOperationName(), math::ExpOp::getOperationName(), + math::ExpM1Op::getOperationName(), math::CbrtOp::getOperationName(), math::SinOp::getOperationName(), math::CosOp::getOperationName()}, name); diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 5ec840e7fef81..62e1c4c3ed3b1 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -32,6 +32,7 @@ #include "llvm/ADT/TypeSwitch.h" #include "llvm/Frontend/OpenMP/OMPConstants.h" #include "llvm/Frontend/OpenMP/OMPDeviceConstants.h" +#include "llvm/Support/Casting.h" #include #include #include @@ -1631,7 +1632,13 @@ static LogicalResult verifyMapClause(Operation *op, OperandRange mapVars) { to ? 
updateToVars.insert(updateVar) : updateFromVars.insert(updateVar); } - } else { + + if (mapInfoOp.getMapperId() && + !SymbolTable::lookupNearestSymbolFrom( + mapInfoOp, mapInfoOp.getMapperIdAttr())) { + return emitError(op->getLoc(), "invalid mapper id"); + } + } else if (!isa(op)) { emitError(op->getLoc(), "map argument is not a map entry operation"); } } @@ -2432,6 +2439,22 @@ LogicalResult DistributeOp::verifyRegions() { return success(); } +//===----------------------------------------------------------------------===// +// DeclareMapperOp / DeclareMapperInfoOp +//===----------------------------------------------------------------------===// + +LogicalResult DeclareMapperInfoOp::verify() { + return verifyMapClause(*this, getMapVars()); +} + +LogicalResult DeclareMapperOp::verifyRegions() { + if (!llvm::isa_and_present( + getRegion().getBlocks().front().getTerminator())) + return emitOpError() << "expected terminator to be a DeclareMapperInfoOp"; + + return success(); +} + //===----------------------------------------------------------------------===// // DeclareReductionOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index 730662fee7db4..846dbfe3e5e8c 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -125,6 +125,7 @@ void OpAsmPrinter::printFunctionalType(Operation *op) { //===----------------------------------------------------------------------===// /// The OpAsmOpInterface, see OpAsmInterface.td for more details. +#include "mlir/IR/OpAsmAttrInterface.cpp.inc" #include "mlir/IR/OpAsmOpInterface.cpp.inc" #include "mlir/IR/OpAsmTypeInterface.cpp.inc" @@ -1190,15 +1191,30 @@ template void AliasInitializer::generateAlias(T symbol, InProgressAliasInfo &alias, bool canBeDeferred) { SmallString<32> nameBuffer; - for (const auto &interface : interfaces) { - OpAsmDialectInterface::AliasResult result = - interface.getAlias(symbol, aliasOS); - if (result == OpAsmDialectInterface::AliasResult::NoAlias) - continue; - nameBuffer = std::move(aliasBuffer); - assert(!nameBuffer.empty() && "expected valid alias name"); - if (result == OpAsmDialectInterface::AliasResult::FinalAlias) - break; + + OpAsmDialectInterface::AliasResult symbolInterfaceResult = + OpAsmDialectInterface::AliasResult::NoAlias; + using InterfaceT = std::conditional_t, + OpAsmAttrInterface, OpAsmTypeInterface>; + if (auto symbolInterface = dyn_cast(symbol)) { + symbolInterfaceResult = symbolInterface.getAlias(aliasOS); + if (symbolInterfaceResult != OpAsmDialectInterface::AliasResult::NoAlias) { + nameBuffer = std::move(aliasBuffer); + assert(!nameBuffer.empty() && "expected valid alias name"); + } + } + + if (symbolInterfaceResult != OpAsmDialectInterface::AliasResult::FinalAlias) { + for (const auto &interface : interfaces) { + OpAsmDialectInterface::AliasResult result = + interface.getAlias(symbol, aliasOS); + if (result == OpAsmDialectInterface::AliasResult::NoAlias) + continue; + nameBuffer = std::move(aliasBuffer); + assert(!nameBuffer.empty() && "expected valid alias name"); + if (result == OpAsmDialectInterface::AliasResult::FinalAlias) + break; + } } if (nameBuffer.empty()) @@ -1567,10 +1583,13 @@ StringRef maybeGetValueNameFromLoc(Value value, StringRef name) { } // namespace void SSANameState::numberValuesInRegion(Region ®ion) { + // Indicates whether OpAsmOpInterface set a name. 
+ bool opAsmOpInterfaceUsed = false; auto setBlockArgNameFn = [&](Value arg, StringRef name) { assert(!valueIDs.count(arg) && "arg numbered multiple times"); assert(llvm::cast(arg).getOwner()->getParent() == &region && "arg not defined in current region"); + opAsmOpInterfaceUsed = true; if (LLVM_UNLIKELY(printerFlags.shouldUseNameLocAsPrefix())) name = maybeGetValueNameFromLoc(arg, name); setValueName(arg, name); @@ -1580,6 +1599,15 @@ void SSANameState::numberValuesInRegion(Region &region) { if (Operation *op = region.getParentOp()) { if (auto asmInterface = dyn_cast(op)) asmInterface.getAsmBlockArgumentNames(region, setBlockArgNameFn); + // If the OpAsmOpInterface didn't set a name, get the name from the type. + if (!opAsmOpInterfaceUsed) { + for (BlockArgument arg : region.getArguments()) { + if (auto interface = dyn_cast<OpAsmTypeInterface>(arg.getType())) { + interface.getAsmName( + [&](StringRef name) { setBlockArgNameFn(arg, name); }); + } + } + } } } @@ -1629,9 +1657,12 @@ void SSANameState::numberValuesInBlock(Block &block) { void SSANameState::numberValuesInOp(Operation &op) { // Function used to set the special result names for the operation. SmallVector resultGroups(/*Size=*/1, /*Value=*/0); + // Indicates whether OpAsmOpInterface set a name. + bool opAsmOpInterfaceUsed = false; auto setResultNameFn = [&](Value result, StringRef name) { assert(!valueIDs.count(result) && "result numbered multiple times"); assert(result.getDefiningOp() == &op && "result not defined by 'op'"); + opAsmOpInterfaceUsed = true; if (LLVM_UNLIKELY(printerFlags.shouldUseNameLocAsPrefix())) name = maybeGetValueNameFromLoc(result, name); setValueName(result, name); @@ -1661,6 +1692,21 @@ void SSANameState::numberValuesInOp(Operation &op) { asmInterface.getAsmBlockNames(setBlockNameFn); asmInterface.getAsmResultNames(setResultNameFn); } + if (!opAsmOpInterfaceUsed) { + // If the OpAsmOpInterface didn't set a name, and all results have + // OpAsmTypeInterface, get names from types. + bool allHaveOpAsmTypeInterface = + llvm::all_of(op.getResultTypes(), [&](Type type) { + return isa<OpAsmTypeInterface>(type); + }); + if (allHaveOpAsmTypeInterface) { + for (OpResult result : op.getResults()) { + auto interface = cast<OpAsmTypeInterface>(result.getType()); + interface.getAsmName( + [&](StringRef name) { setResultNameFn(result, name); }); + } + } + } } unsigned numResults = op.getNumResults(); diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index 51a3cbdbb5e7f..b9d88a68410ee 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -2809,13 +2809,23 @@ getRefPtrIfDeclareTarget(mlir::Value value, } namespace { +// Append customMappers information to the existing MapInfosTy. +struct MapInfosTy : llvm::OpenMPIRBuilder::MapInfosTy { + SmallVector Mappers; + + /// Append arrays in \a curInfo. + void append(MapInfosTy &curInfo) { + Mappers.append(curInfo.Mappers.begin(), curInfo.Mappers.end()); + llvm::OpenMPIRBuilder::MapInfosTy::append(curInfo); + } +}; // A small helper structure to contain data gathered // for map lowering and coalesce it into one area, // avoiding extra computations such as searches in the // llvm module for lowered mapped variables or checking // if something is declare target (and retrieving the // value) more than necessary.
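One design point worth noting: OpenMPIRBuilder::MapInfosTy is a bundle of parallel arrays in which index i across all arrays describes a single map entry, and the subclass above adds Mappers as one more such array, so every path that appends map data must also append a Mappers entry (null when no user-defined mapper applies). A standalone sketch of that invariant with simplified types (names are mine, not the real OpenMPIRBuilder shapes):

#include <cassert>
#include <string>
#include <vector>

// Simplified stand-ins for the parallel map arrays.
struct MapArrays {
  std::vector<void *> BasePointers, Pointers;
  std::vector<std::string> Mappers; // "" == no user-defined mapper

  void addEntry(void *base, void *ptr, std::string mapper = {}) {
    BasePointers.push_back(base);
    Pointers.push_back(ptr);
    Mappers.push_back(std::move(mapper)); // keep arrays index-aligned
    assert(Mappers.size() == BasePointers.size() && "arrays out of sync");
  }
};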
-struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { +struct MapInfoData : MapInfosTy { llvm::SmallVector IsDeclareTarget; llvm::SmallVector IsAMember; // Identify if mapping was added by mapClause or use_device clauses. @@ -2834,7 +2844,7 @@ struct MapInfoData : llvm::OpenMPIRBuilder::MapInfosTy { OriginalValue.append(CurInfo.OriginalValue.begin(), CurInfo.OriginalValue.end()); BaseType.append(CurInfo.BaseType.begin(), CurInfo.BaseType.end()); - llvm::OpenMPIRBuilder::MapInfosTy::append(CurInfo); + MapInfosTy::append(CurInfo); } }; } // namespace @@ -2955,6 +2965,12 @@ static void collectMapDataFromMapOperands( mapData.Names.push_back(LLVM::createMappingInformation( mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); mapData.DevicePointers.push_back(llvm::OpenMPIRBuilder::DeviceInfoTy::None); + if (mapOp.getMapperId()) + mapData.Mappers.push_back( + SymbolTable::lookupNearestSymbolFrom( + mapOp, mapOp.getMapperIdAttr())); + else + mapData.Mappers.push_back(nullptr); mapData.IsAMapping.push_back(true); mapData.IsAMember.push_back(checkIsAMember(mapVars, mapOp)); } @@ -2999,6 +3015,7 @@ static void collectMapDataFromMapOperands( mapData.Names.push_back(LLVM::createMappingInformation( mapOp.getLoc(), *moduleTranslation.getOpenMPBuilder())); mapData.DevicePointers.push_back(devInfoTy); + mapData.Mappers.push_back(nullptr); mapData.IsAMapping.push_back(false); mapData.IsAMember.push_back(checkIsAMember(useDevOperands, mapOp)); } @@ -3164,9 +3181,8 @@ calculateBoundsOffset(LLVM::ModuleTranslation &moduleTranslation, // inside of CGOpenMPRuntime.cpp static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, - llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, - llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, - uint64_t mapDataIndex, bool isTargetParams) { + llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, MapInfosTy &combinedInfo, + MapInfoData &mapData, uint64_t mapDataIndex, bool isTargetParams) { // Map the first segment of our structure combinedInfo.Types.emplace_back( isTargetParams @@ -3174,6 +3190,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( : llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_NONE); combinedInfo.DevicePointers.emplace_back( mapData.DevicePointers[mapDataIndex]); + combinedInfo.Mappers.emplace_back(mapData.Mappers[mapDataIndex]); combinedInfo.Names.emplace_back(LLVM::createMappingInformation( mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -3237,6 +3254,7 @@ static llvm::omp::OpenMPOffloadMappingFlags mapParentWithMembers( combinedInfo.Types.emplace_back(mapFlag); combinedInfo.DevicePointers.emplace_back( llvm::OpenMPIRBuilder::DeviceInfoTy::None); + combinedInfo.Mappers.emplace_back(nullptr); combinedInfo.Names.emplace_back(LLVM::createMappingInformation( mapData.MapClause[mapDataIndex]->getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back(mapData.BasePointers[mapDataIndex]); @@ -3270,9 +3288,9 @@ static bool checkIfPointerMap(omp::MapInfoOp mapOp) { // This function is intended to add explicit mappings of members static void processMapMembersWithParent( LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, - llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, - llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, - uint64_t mapDataIndex, llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) { + 
llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, MapInfosTy &combinedInfo, + MapInfoData &mapData, uint64_t mapDataIndex, + llvm::omp::OpenMPOffloadMappingFlags memberOfFlag) { auto parentClause = llvm::cast(mapData.MapClause[mapDataIndex]); @@ -3300,6 +3318,7 @@ static void processMapMembersWithParent( combinedInfo.Types.emplace_back(mapFlag); combinedInfo.DevicePointers.emplace_back( llvm::OpenMPIRBuilder::DeviceInfoTy::None); + combinedInfo.Mappers.emplace_back(nullptr); combinedInfo.Names.emplace_back( LLVM::createMappingInformation(memberClause.getLoc(), ompBuilder)); combinedInfo.BasePointers.emplace_back( @@ -3322,6 +3341,7 @@ static void processMapMembersWithParent( combinedInfo.Types.emplace_back(mapFlag); combinedInfo.DevicePointers.emplace_back( mapData.DevicePointers[memberDataIdx]); + combinedInfo.Mappers.emplace_back(mapData.Mappers[memberDataIdx]); combinedInfo.Names.emplace_back( LLVM::createMappingInformation(memberClause.getLoc(), ompBuilder)); uint64_t basePointerIndex = @@ -3341,10 +3361,9 @@ static void processMapMembersWithParent( } } -static void -processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, - llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, - bool isTargetParams, int mapDataParentIdx = -1) { +static void processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, + MapInfosTy &combinedInfo, bool isTargetParams, + int mapDataParentIdx = -1) { // Declare Target Mappings are excluded from being marked as // OMP_MAP_TARGET_PARAM as they are not passed as parameters, they're // marked with OMP_MAP_PTR_AND_OBJ instead. @@ -3374,16 +3393,18 @@ processIndividualMap(MapInfoData &mapData, size_t mapDataIdx, combinedInfo.Pointers.emplace_back(mapData.Pointers[mapDataIdx]); combinedInfo.DevicePointers.emplace_back(mapData.DevicePointers[mapDataIdx]); + combinedInfo.Mappers.emplace_back(mapData.Mappers[mapDataIdx]); combinedInfo.Names.emplace_back(mapData.Names[mapDataIdx]); combinedInfo.Types.emplace_back(mapFlag); combinedInfo.Sizes.emplace_back(mapData.Sizes[mapDataIdx]); } -static void processMapWithMembersOf( - LLVM::ModuleTranslation &moduleTranslation, llvm::IRBuilderBase &builder, - llvm::OpenMPIRBuilder &ompBuilder, DataLayout &dl, - llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, MapInfoData &mapData, - uint64_t mapDataIndex, bool isTargetParams) { +static void processMapWithMembersOf(LLVM::ModuleTranslation &moduleTranslation, + llvm::IRBuilderBase &builder, + llvm::OpenMPIRBuilder &ompBuilder, + DataLayout &dl, MapInfosTy &combinedInfo, + MapInfoData &mapData, uint64_t mapDataIndex, + bool isTargetParams) { auto parentClause = llvm::cast(mapData.MapClause[mapDataIndex]); @@ -3488,8 +3509,7 @@ createAlteredByCaptureMap(MapInfoData &mapData, // Generate all map related information and fill the combinedInfo. 
static void genMapInfos(llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, - DataLayout &dl, - llvm::OpenMPIRBuilder::MapInfosTy &combinedInfo, + DataLayout &dl, MapInfosTy &combinedInfo, MapInfoData &mapData, bool isTargetParams = false) { // We wish to modify some of the methods in which arguments are // passed based on their capture type by the target region, this can @@ -3529,6 +3549,78 @@ static void genMapInfos(llvm::IRBuilderBase &builder, } } +static llvm::Expected +emitUserDefinedMapper(Operation *declMapperOp, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::StringRef mapperFuncName); + +static llvm::Expected +getOrCreateUserDefinedMapperFunc(Operation *op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation) { + auto declMapperOp = cast(op); + std::string mapperFuncName = + moduleTranslation.getOpenMPBuilder()->createPlatformSpecificName( + {"omp_mapper", declMapperOp.getSymName()}); + + if (auto *lookupFunc = moduleTranslation.lookupFunction(mapperFuncName)) + return lookupFunc; + + return emitUserDefinedMapper(declMapperOp, builder, moduleTranslation, + mapperFuncName); +} + +static llvm::Expected +emitUserDefinedMapper(Operation *op, llvm::IRBuilderBase &builder, + LLVM::ModuleTranslation &moduleTranslation, + llvm::StringRef mapperFuncName) { + auto declMapperOp = cast(op); + auto declMapperInfoOp = declMapperOp.getDeclareMapperInfo(); + DataLayout dl = DataLayout(declMapperOp->getParentOfType()); + llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + llvm::Type *varType = moduleTranslation.convertType(declMapperOp.getType()); + SmallVector mapVars = declMapperInfoOp.getMapVars(); + + using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; + + // Fill up the arrays with all the mapped variables. + MapInfosTy combinedInfo; + auto genMapInfoCB = + [&](InsertPointTy codeGenIP, llvm::Value *ptrPHI, + llvm::Value *unused2) -> llvm::OpenMPIRBuilder::MapInfosOrErrorTy { + builder.restoreIP(codeGenIP); + moduleTranslation.mapValue(declMapperOp.getSymVal(), ptrPHI); + moduleTranslation.mapBlock(&declMapperOp.getRegion().front(), + builder.GetInsertBlock()); + if (failed(moduleTranslation.convertBlock(declMapperOp.getRegion().front(), + /*ignoreArguments=*/true, + builder))) + return llvm::make_error(); + MapInfoData mapData; + collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, + builder); + genMapInfos(builder, moduleTranslation, dl, combinedInfo, mapData); + + // Drop the mapping that is no longer necessary so that the same region can + // be processed multiple times. + moduleTranslation.forgetMapping(declMapperOp.getRegion()); + return combinedInfo; + }; + + auto customMapperCB = [&](unsigned i) -> llvm::Expected { + if (!combinedInfo.Mappers[i]) + return nullptr; + return getOrCreateUserDefinedMapperFunc(combinedInfo.Mappers[i], builder, + moduleTranslation); + }; + + llvm::Expected newFn = ompBuilder->emitUserDefinedMapper( + genMapInfoCB, varType, mapperFuncName, customMapperCB); + if (!newFn) + return newFn.takeError(); + moduleTranslation.mapFunction(mapperFuncName, *newFn); + return *newFn; +} + static LogicalResult convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { @@ -3640,9 +3732,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, builder, useDevicePtrVars, useDeviceAddrVars); // Fill up the arrays with all the mapped variables. 
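The two mapper-emission functions above are mutually recursive on purpose: getOrCreateUserDefinedMapperFunc memoizes by the platform-specific function name (the test below expects `.omp_mapper._QQFmy_testmy_mapper`), and customMapperCB re-enters it for map entries that name another mapper, so each omp.declare_mapper is lowered at most once. The caching shape, reduced to a standalone sketch with assumed names:

#include "llvm/ADT/STLFunctionalExtras.h"
#include "llvm/ADT/StringMap.h"

// Sketch only: `emitFn` stands in for emitUserDefinedMapper and `Fn` for
// llvm::Function *; the real cache lives in the ModuleTranslation.
template <typename Fn>
Fn getOrCreate(llvm::StringRef name, llvm::StringMap<Fn> &cache,
               llvm::function_ref<Fn()> emitFn) {
  if (auto it = cache.find(name); it != cache.end())
    return it->second; // reused by a later map clause or nested mapper
  Fn fn = emitFn();    // first use: emit the mapper body
  cache[name] = fn;
  return fn;
}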
- llvm::OpenMPIRBuilder::MapInfosTy combinedInfo; - auto genMapInfoCB = - [&](InsertPointTy codeGenIP) -> llvm::OpenMPIRBuilder::MapInfosTy & { + MapInfosTy combinedInfo; + auto genMapInfoCB = [&](InsertPointTy codeGenIP) -> MapInfosTy & { builder.restoreIP(codeGenIP); genMapInfos(builder, moduleTranslation, DL, combinedInfo, mapData); return combinedInfo; @@ -3685,6 +3776,7 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, using BodyGenTy = llvm::OpenMPIRBuilder::BodyGenTy; auto bodyGenCB = [&](InsertPointTy codeGenIP, BodyGenTy bodyGenType) -> llvm::OpenMPIRBuilder::InsertPointOrErrorTy { + builder.restoreIP(codeGenIP); assert(isa(op) && "BodyGen requested for non TargetDataOp"); auto blockArgIface = cast(op); @@ -3693,8 +3785,6 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, case BodyGenTy::Priv: // Check if any device ptr/addr info is available if (!info.DevicePtrInfoMap.empty()) { - builder.restoreIP(codeGenIP); - mapUseDevice(llvm::OpenMPIRBuilder::DeviceInfoTy::Address, blockArgIface.getUseDeviceAddrBlockArgs(), useDeviceAddrVars, mapData, @@ -3724,7 +3814,6 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, case BodyGenTy::NoPriv: // If device info is available then region has already been generated if (info.DevicePtrInfoMap.empty()) { - builder.restoreIP(codeGenIP); // For device pass, if use_device_ptr(addr) mappings were present, // we need to link them here before codegen. if (ompBuilder->Config.IsTargetDevice.value_or(false)) { @@ -3745,17 +3834,28 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, return builder.saveIP(); }; + auto customMapperCB = + [&](unsigned int i) -> llvm::Expected { + if (!combinedInfo.Mappers[i]) + return nullptr; + info.HasMapper = true; + return getOrCreateUserDefinedMapperFunc(combinedInfo.Mappers[i], builder, + moduleTranslation); + }; + llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); llvm::OpenMPIRBuilder::InsertPointTy allocaIP = findAllocaInsertPoint(builder, moduleTranslation); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = [&]() { if (isa(op)) - return ompBuilder->createTargetData( - ompLoc, allocaIP, builder.saveIP(), builder.getInt64(deviceID), - ifCond, info, genMapInfoCB, nullptr, bodyGenCB); - return ompBuilder->createTargetData(ompLoc, allocaIP, builder.saveIP(), - builder.getInt64(deviceID), ifCond, - info, genMapInfoCB, &RTLFn); + return ompBuilder->createTargetData(ompLoc, allocaIP, builder.saveIP(), + builder.getInt64(deviceID), ifCond, + info, genMapInfoCB, customMapperCB, + /*MapperFunc=*/nullptr, bodyGenCB, + /*DeviceAddrCB=*/nullptr); + return ompBuilder->createTargetData( + ompLoc, allocaIP, builder.saveIP(), builder.getInt64(deviceID), ifCond, + info, genMapInfoCB, customMapperCB, &RTLFn); }(); if (failed(handleError(afterIP, *op))) @@ -4367,9 +4467,9 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, collectMapDataFromMapOperands(mapData, mapVars, moduleTranslation, dl, builder); - llvm::OpenMPIRBuilder::MapInfosTy combinedInfos; - auto genMapInfoCB = [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) - -> llvm::OpenMPIRBuilder::MapInfosTy & { + MapInfosTy combinedInfos; + auto genMapInfoCB = + [&](llvm::OpenMPIRBuilder::InsertPointTy codeGenIP) -> MapInfosTy & { builder.restoreIP(codeGenIP); genMapInfos(builder, moduleTranslation, dl, combinedInfos, mapData, true); return combinedInfos; @@ -4438,15 +4538,28 @@ convertOmpTarget(Operation &opInst, llvm::IRBuilderBase &builder, findAllocaInsertPoint(builder, 
moduleTranslation); llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder); + llvm::OpenMPIRBuilder::TargetDataInfo info( + /*RequiresDevicePointerInfo=*/false, + /*SeparateBeginEndCalls=*/true); + + auto customMapperCB = + [&](unsigned int i) -> llvm::Expected { + if (!combinedInfos.Mappers[i]) + return nullptr; + info.HasMapper = true; + return getOrCreateUserDefinedMapperFunc(combinedInfos.Mappers[i], builder, + moduleTranslation); + }; + llvm::Value *ifCond = nullptr; if (Value targetIfCond = targetOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(targetIfCond); llvm::OpenMPIRBuilder::InsertPointOrErrorTy afterIP = moduleTranslation.getOpenMPBuilder()->createTarget( - ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), entryInfo, + ompLoc, isOffloadEntry, allocaIP, builder.saveIP(), info, entryInfo, defaultAttrs, runtimeAttrs, ifCond, kernelInput, genMapInfoCB, bodyCB, - argAccessorCB, dds, targetOp.getNowait()); + argAccessorCB, customMapperCB, dds, targetOp.getNowait()); if (failed(handleError(afterIP, opInst))) return failure(); @@ -4673,12 +4786,15 @@ convertHostOrTargetOperation(Operation *op, llvm::IRBuilderBase &builder, .Case([&](omp::TaskwaitOp op) { return convertOmpTaskwaitOp(op, builder, moduleTranslation); }) - .Case([](auto op) { // `yield` and `terminator` can be just omitted. The block structure // was created in the region that handles their parent operation. // `declare_reduction` will be used by reductions and is not // converted directly, skip it. + // `declare_mapper` and `declare_mapper.info` are handled whenever they + // are referred to through a `map` clause. // `critical.declare` is only used to declare names of critical // sections which will be used by `critical` ops and hence can be // ignored for lowering. The OpenMP IRBuilder will create unique diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index fd0283b856b6b..8445e609c2244 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -962,13 +962,18 @@ ModuleImport::getOrCreateNamelessSymbolName(llvm::GlobalVariable *globalVar) { return symbolRef; } -LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { - // Insert the global after the last one or at the start of the module. +OpBuilder::InsertionGuard ModuleImport::setGlobalInsertionPoint() { OpBuilder::InsertionGuard guard(builder); - if (!aliasInsertionOp) - builder.setInsertionPointToStart(mlirModule.getBody()); + if (globalInsertionOp) + builder.setInsertionPointAfter(globalInsertionOp); else - builder.setInsertionPointAfter(aliasInsertionOp); + builder.setInsertionPointToStart(mlirModule.getBody()); + return guard; +} + +LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { + // Insert the alias after the last one or at the start of the module. 
+ OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); Type type = convertType(alias->getValueType()); AliasOp aliasOp = builder.create( @@ -977,7 +982,7 @@ LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { /*dso_local=*/alias->isDSOLocal(), /*thread_local=*/alias->isThreadLocal(), /*attrs=*/ArrayRef()); - aliasInsertionOp = aliasOp; + globalInsertionOp = aliasOp; clearRegionState(); Block *block = builder.createBlock(&aliasOp.getInitializerRegion()); @@ -996,11 +1001,7 @@ LogicalResult ModuleImport::convertAlias(llvm::GlobalAlias *alias) { LogicalResult ModuleImport::convertGlobal(llvm::GlobalVariable *globalVar) { // Insert the global after the last one or at the start of the module. - OpBuilder::InsertionGuard guard(builder); - if (!globalInsertionOp) - builder.setInsertionPointToStart(mlirModule.getBody()); - else - builder.setInsertionPointAfter(globalInsertionOp); + OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); Attribute valueAttr; if (globalVar->hasInitializer()) @@ -1096,11 +1097,8 @@ ModuleImport::convertGlobalCtorsAndDtors(llvm::GlobalVariable *globalVar) { priorities.push_back(priority->getValue().getZExtValue()); } - OpBuilder::InsertionGuard guard(builder); - if (!globalInsertionOp) - builder.setInsertionPointToStart(mlirModule.getBody()); - else - builder.setInsertionPointAfter(globalInsertionOp); + // Insert the global after the last one or at the start of the module. + OpBuilder::InsertionGuard guard = setGlobalInsertionPoint(); if (globalVar->getName() == getGlobalCtorsVarName()) { globalInsertionOp = builder.create( diff --git a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir index 6f1ed73e778b4..d69de998346b5 100644 --- a/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir +++ b/mlir/test/Conversion/OpenMPToLLVM/convert-to-llvmir.mlir @@ -601,3 +601,16 @@ func.func @omp_taskloop(%arg0: index, %arg1 : memref) { } return } + +// ----- + +// CHECK-LABEL: omp.declare_mapper @my_mapper : !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)> { +omp.declare_mapper @my_mapper : !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)> { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)> + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"} + %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFdeclare_mapperTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true} + // CHECK: omp.declare_mapper.info map_entries(%{{.*}}, %{{.*}} : !llvm.ptr, !llvm.ptr) + omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr) +} diff --git a/mlir/test/Dialect/Math/polynomial-approximation.mlir b/mlir/test/Dialect/Math/polynomial-approximation.mlir index 81d071e6bbba3..bf7c4134af12e 100644 --- a/mlir/test/Dialect/Math/polynomial-approximation.mlir +++ b/mlir/test/Dialect/Math/polynomial-approximation.mlir @@ -81,6 +81,116 @@ func.func @erf_scalar(%arg0: f32) -> f32 { return %0 : f32 } +// CHECK-LABEL: func @erfc_scalar( +// CHECK-SAME: %[[val_arg0:.*]]: f32) -> f32 { +// CHECK-DAG: %[[c127_i32:.*]] = arith.constant 127 : i32 +// CHECK-DAG: %[[c23_i32:.*]] = arith.constant 23 : i32 +// CHECK-DAG: %[[cst:.*]] = arith.constant 1.270000e+02 : f32 +// CHECK-DAG: %[[cst_0:.*]] = arith.constant -1.270000e+02 : f32 +// 
CHECK-DAG: %[[cst_1:.*]] = arith.constant 8.880000e+01 : f32 +// CHECK-DAG: %[[cst_2:.*]] = arith.constant -8.780000e+01 : f32 +// CHECK-DAG: %[[cst_3:.*]] = arith.constant 0.166666657 : f32 +// CHECK-DAG: %[[cst_4:.*]] = arith.constant 0.0416657962 : f32 +// CHECK-DAG: %[[cst_5:.*]] = arith.constant 0.00833345205 : f32 +// CHECK-DAG: %[[cst_6:.*]] = arith.constant 0.00139819994 : f32 +// CHECK-DAG: %[[cst_7:.*]] = arith.constant 1.98756912E-4 : f32 +// CHECK-DAG: %[[cst_8:.*]] = arith.constant 2.12194442E-4 : f32 +// CHECK-DAG: %[[cst_9:.*]] = arith.constant -0.693359375 : f32 +// CHECK-DAG: %[[cst_10:.*]] = arith.constant 1.44269502 : f32 +// CHECK-DAG: %[[cst_11:.*]] = arith.constant 0.276978403 : f32 +// CHECK-DAG: %[[cst_12:.*]] = arith.constant -0.0927639827 : f32 +// CHECK-DAG: %[[cst_13:.*]] = arith.constant -0.166031361 : f32 +// CHECK-DAG: %[[cst_14:.*]] = arith.constant 0.164055392 : f32 +// CHECK-DAG: %[[cst_15:.*]] = arith.constant -0.0542046614 : f32 +// CHECK-DAG: %[[cst_16:.*]] = arith.constant -8.059920e-03 : f32 +// CHECK-DAG: %[[cst_17:.*]] = arith.constant 0.00863227434 : f32 +// CHECK-DAG: %[[cst_18:.*]] = arith.constant 0.00131355342 : f32 +// CHECK-DAG: %[[cst_19:.*]] = arith.constant -0.0012307521 : f32 +// CHECK-DAG: %[[cst_20:.*]] = arith.constant -4.01139259E-4 : f32 +// CHECK-DAG: %[[cst_true:.*]] = arith.constant true +// CHECK-DAG: %[[cst_21:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-DAG: %[[cst_22:.*]] = arith.constant 1.000000e+00 : f32 +// CHECK-DAG: %[[cst_23:.*]] = arith.constant 5.000000e-01 : f32 +// CHECK-DAG: %[[cst_24:.*]] = arith.constant -4.000000e+00 : f32 +// CHECK-DAG: %[[cst_25:.*]] = arith.constant -2.000000e+00 : f32 +// CHECK-DAG: %[[cst_26:.*]] = arith.constant 2.000000e+00 : f32 +// CHECK-DAG: %[[cst_27:.*]] = arith.constant 0x7F800000 : f32 +// CHECK-DAG: %[[cst_28:.*]] = arith.constant 10.0546875 : f32 +// CHECK: %[[val_2:.*]] = math.absf %[[val_arg0]] : f32 +// CHECK-NEXT: %[[val_3:.*]] = arith.addf %[[val_2]], %[[cst_26]] : f32 +// CHECK-NEXT: %[[val_4:.*]] = arith.divf %[[cst_22]], %[[val_3]] : f32 +// CHECK-NEXT: %[[val_5:.*]] = math.fma %[[cst_24]], %[[val_4]], %[[cst_22]] : f32 +// CHECK-NEXT: %[[val_6:.*]] = arith.addf %[[val_5]], %[[cst_22]] : f32 +// CHECK-NEXT: %[[val_7:.*]] = math.fma %[[val_6]], %[[cst_25]], %[[val_2]] : f32 +// CHECK-NEXT: %[[val_8:.*]] = arith.negf %[[val_2]] : f32 +// CHECK-NEXT: %[[val_9:.*]] = math.fma %[[val_8]], %[[val_5]], %[[val_7]] : f32 +// CHECK-NEXT: %[[val_10:.*]] = math.fma %[[val_4]], %[[val_9]], %[[val_5]] : f32 +// CHECK-NEXT: %[[val_11:.*]] = math.fma %[[cst_20]], %[[val_10]], %[[cst_19]] : f32 +// CHECK-NEXT: %[[val_12:.*]] = math.fma %[[val_11]], %[[val_10]], %[[cst_18]] : f32 +// CHECK-NEXT: %[[val_13:.*]] = math.fma %[[val_12]], %[[val_10]], %[[cst_17]] : f32 +// CHECK-NEXT: %[[val_14:.*]] = math.fma %[[val_13]], %[[val_10]], %[[cst_16]] : f32 +// CHECK-NEXT: %[[val_15:.*]] = math.fma %[[val_14]], %[[val_10]], %[[cst_15]] : f32 +// CHECK-NEXT: %[[val_16:.*]] = math.fma %[[val_15]], %[[val_10]], %[[cst_14]] : f32 +// CHECK-NEXT: %[[val_17:.*]] = math.fma %[[val_16]], %[[val_10]], %[[cst_13]] : f32 +// CHECK-NEXT: %[[val_18:.*]] = math.fma %[[val_17]], %[[val_10]], %[[cst_12]] : f32 +// CHECK-NEXT: %[[val_19:.*]] = math.fma %[[val_18]], %[[val_10]], %[[cst_11]] : f32 +// CHECK-NEXT: %[[val_20:.*]] = math.fma %[[cst_26]], %[[val_2]], %[[cst_22]] : f32 +// CHECK-NEXT: %[[val_21:.*]] = arith.divf %[[cst_22]], %[[val_20]] : f32 +// CHECK-NEXT: %[[val_22:.*]] = math.fma %[[val_19]], 
%[[val_21]], %[[val_21]] : f32 +// CHECK-NEXT: %[[val_23:.*]] = arith.negf %[[val_2]] : f32 +// CHECK-NEXT: %[[val_24:.*]] = math.fma %[[val_22]], %[[val_23]], %[[cst_23]] : f32 +// CHECK-NEXT: %[[val_25:.*]] = arith.subf %[[val_19]], %[[val_22]] : f32 +// CHECK-NEXT: %[[val_26:.*]] = math.fma %[[val_24]], %[[cst_26]], %[[val_25]] : f32 +// CHECK-NEXT: %[[val_27:.*]] = math.fma %[[val_26]], %[[val_21]], %[[val_22]] : f32 +// CHECK-NEXT: %[[val_28:.*]] = arith.mulf %[[val_2]], %[[val_2]] : f32 +// CHECK-NEXT: %[[val_29:.*]] = arith.negf %[[val_28]] : f32 +// CHECK-NEXT: %[[val_30:.*]] = arith.cmpf uge, %[[val_29]], %[[cst_2]] : f32 +// CHECK-NEXT: %[[val_31:.*]] = arith.select %[[val_30]], %[[val_29]], %[[cst_2]] : f32 +// CHECK-NEXT: %[[val_32:.*]] = arith.cmpf ule, %[[val_31]], %[[cst_1]] : f32 +// CHECK-NEXT: %[[val_33:.*]] = arith.select %[[val_32]], %[[val_31]], %[[cst_1]] : f32 +// CHECK-NEXT: %[[val_34:.*]] = math.fma %[[val_33]], %[[cst_10]], %[[cst_23]] : f32 +// CHECK-NEXT: %[[val_35:.*]] = math.floor %[[val_34]] : f32 +// CHECK-NEXT: %[[val_36:.*]] = arith.cmpf uge, %[[val_35]], %[[cst_0]] : f32 +// CHECK-NEXT: %[[val_37:.*]] = arith.select %[[val_36]], %[[val_35]], %[[cst_0]] : f32 +// CHECK-NEXT: %[[val_38:.*]] = arith.cmpf ule, %[[val_37]], %[[cst]] : f32 +// CHECK-NEXT: %[[val_39:.*]] = arith.select %[[val_38]], %[[val_37]], %[[cst]] : f32 +// CHECK-NEXT: %[[val_40:.*]] = math.fma %[[cst_9]], %[[val_39]], %[[val_33]] : f32 +// CHECK-NEXT: %[[val_41:.*]] = math.fma %[[cst_8]], %[[val_39]], %[[val_40]] : f32 +// CHECK-NEXT: %[[val_42:.*]] = math.fma %[[val_41]], %[[cst_7]], %[[cst_6]] : f32 +// CHECK-NEXT: %[[val_43:.*]] = math.fma %[[val_42]], %[[val_41]], %[[cst_5]] : f32 +// CHECK-NEXT: %[[val_44:.*]] = math.fma %[[val_43]], %[[val_41]], %[[cst_4]] : f32 +// CHECK-NEXT: %[[val_45:.*]] = math.fma %[[val_44]], %[[val_41]], %[[cst_3]] : f32 +// CHECK-NEXT: %[[val_46:.*]] = math.fma %[[val_45]], %[[val_41]], %[[cst_23]] : f32 +// CHECK-NEXT: %[[val_47:.*]] = arith.mulf %[[val_41]], %[[val_41]] : f32 +// CHECK-NEXT: %[[val_48:.*]] = math.fma %[[val_46]], %[[val_47]], %[[val_41]] : f32 +// CHECK-NEXT: %[[val_49:.*]] = arith.addf %[[val_48]], %[[cst_22]] : f32 +// CHECK-NEXT: %[[val_50:.*]] = arith.fptosi %[[val_39]] : f32 to i32 +// CHECK-NEXT: %[[val_51:.*]] = arith.addi %[[val_50]], %[[c127_i32]] : i32 +// CHECK-NEXT: %[[val_52:.*]] = arith.shli %[[val_51]], %[[c23_i32]] : i32 +// CHECK-NEXT: %[[val_53:.*]] = arith.bitcast %[[val_52]] : i32 to f32 +// CHECK-NEXT: %[[val_54:.*]] = arith.mulf %[[val_49]], %[[val_53]] : f32 +// CHECK-NEXT: %[[val_55:.*]] = arith.negf %[[val_2]] : f32 +// CHECK-NEXT: %[[val_56:.*]] = math.fma %[[val_55]], %[[val_2]], %[[val_28]] : f32 +// CHECK-NEXT: %[[val_57:.*]] = arith.mulf %[[val_27]], %[[val_54]] : f32 +// CHECK-NEXT: %[[val_58:.*]] = arith.mulf %[[val_57]], %[[val_56]] : f32 +// CHECK-NEXT: %[[val_59:.*]] = math.fma %[[val_27]], %[[val_54]], %[[val_58]] : f32 +// CHECK-NEXT: %[[val_60:.*]] = arith.cmpf olt, %[[val_2]], %[[cst_27]] : f32 +// CHECK-NEXT: %[[val_61:.*]] = arith.xori %[[val_60]], %[[cst_true]] : i1 +// CHECK-NEXT: %[[val_62:.*]] = arith.addf %[[val_arg0]], %[[val_arg0]] : f32 +// CHECK-NEXT: %[[val_63:.*]] = arith.select %[[val_61]], %[[val_62]], %[[val_59]] : f32 +// CHECK-NEXT: %[[val_64:.*]] = arith.cmpf ogt, %[[val_2]], %[[cst_28]] : f32 +// CHECK-NEXT: %[[val_65:.*]] = arith.select %[[val_64]], %[[cst_21]], %[[val_63]] : f32 +// CHECK-NEXT: %[[val_66:.*]] = arith.cmpf olt, %[[val_arg0]], %[[cst_21]] : f32 +// CHECK-NEXT: 
%[[val_67:.*]] = arith.subf %[[cst_26]], %[[val_65]] : f32 +// CHECK-NEXT: %[[val_68:.*]] = arith.select %[[val_66]], %[[val_67]], %[[val_65]] : f32 +// CHECK-NEXT: return %[[val_68]] : f32 +// CHECK-NEXT: } + +func.func @erfc_scalar(%arg0: f32) -> f32 { + %0 = math.erfc %arg0 : f32 + return %0 : f32 +} + // CHECK-LABEL: func @erf_vector( // CHECK-SAME: %[[arg0:.*]]: vector<8xf32>) -> vector<8xf32> { // CHECK: %[[zero:.*]] = arith.constant dense<0.000000e+00> : vector<8xf32> diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir index 02b0af17564d4..d7f468bed3d3d 100644 --- a/mlir/test/Dialect/OpenMP/invalid.mlir +++ b/mlir/test/Dialect/OpenMP/invalid.mlir @@ -2842,3 +2842,20 @@ func.func @missing_workshare(%idx : index) { } return } + +// ----- + // expected-error @below {{op expected terminator to be a DeclareMapperInfoOp}} + omp.declare_mapper @missing_declareMapperInfo : !llvm.struct<"mytype", (array<1024 x i32>)> { + ^bb0(%arg0: !llvm.ptr): + omp.terminator + } + +// ----- +llvm.func @invalid_mapper(%0 : !llvm.ptr) { + %1 = omp.map.info var_ptr(%0 : !llvm.ptr, !llvm.struct<"my_type", (i32)>) mapper(@my_mapper) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} + // expected-error @below {{invalid mapper id}} + omp.target_data map_entries(%1 : !llvm.ptr) { + omp.terminator + } + llvm.return +} diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir index aca63600876aa..e318afbebbf0c 100644 --- a/mlir/test/Dialect/OpenMP/ops.mlir +++ b/mlir/test/Dialect/OpenMP/ops.mlir @@ -879,6 +879,15 @@ cleanup { omp.yield } +// CHECK: omp.declare_mapper @my_mapper : !llvm.struct<"my_type", (i32)> +omp.declare_mapper @my_mapper : !llvm.struct<"my_type", (i32)> { +^bb0(%arg: !llvm.ptr): + // CHECK: %[[DECL_MAP_INFO:.*]] = omp.map.info var_ptr(%{{.*}} : !llvm.ptr, !llvm.struct<"my_type", (i32)>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + %decl_map_info = omp.map.info var_ptr(%arg : !llvm.ptr, !llvm.struct<"my_type", (i32)>) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = ""} + // CHECK: omp.declare_mapper.info map_entries(%[[DECL_MAP_INFO]] : !llvm.ptr) + omp.declare_mapper.info map_entries(%decl_map_info : !llvm.ptr) +} + // CHECK-LABEL: func @wsloop_reduction func.func @wsloop_reduction(%lb : index, %ub : index, %step : index) { %c1 = arith.constant 1 : i32 @@ -2537,13 +2546,13 @@ func.func @omp_targets_with_map_bounds(%arg0: !llvm.ptr, %arg1: !llvm.ptr) -> () // CHECK: %[[C_12:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[C_13:.*]] = llvm.mlir.constant(2 : index) : i64 // CHECK: %[[BOUNDS1:.*]] = omp.map.bounds lower_bound(%[[C_11]] : i64) upper_bound(%[[C_10]] : i64) stride(%[[C_12]] : i64) start_idx(%[[C_13]] : i64) - // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} + // CHECK: %[[MAP1:.*]] = omp.map.info var_ptr(%[[ARG1]] : !llvm.ptr, !llvm.array<10 x i32>) mapper(@my_mapper) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%[[BOUNDS1]]) -> !llvm.ptr {name = ""} %6 = llvm.mlir.constant(9 : index) : i64 %7 = llvm.mlir.constant(1 : index) : i64 %8 = llvm.mlir.constant(2 : index) : i64 %9 = llvm.mlir.constant(2 : index) : i64 %10 = omp.map.bounds lower_bound(%7 : i64) upper_bound(%6 : i64) stride(%8 : i64) start_idx(%9 : i64) - %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) map_clauses(exit_release_or_enter_alloc) 
capture(ByCopy) bounds(%10) -> !llvm.ptr {name = ""} + %mapv2 = omp.map.info var_ptr(%arg1 : !llvm.ptr, !llvm.array<10 x i32>) mapper(@my_mapper) map_clauses(exit_release_or_enter_alloc) capture(ByCopy) bounds(%10) -> !llvm.ptr {name = ""} // CHECK: omp.target map_entries(%[[MAP0]] -> {{.*}}, %[[MAP1]] -> {{.*}} : !llvm.ptr, !llvm.ptr) omp.target map_entries(%mapv1 -> %arg2, %mapv2 -> %arg3 : !llvm.ptr, !llvm.ptr) { diff --git a/mlir/test/IR/op-asm-interface.mlir b/mlir/test/IR/op-asm-interface.mlir index a9c199e3dc973..086dc7da421c2 100644 --- a/mlir/test/IR/op-asm-interface.mlir +++ b/mlir/test/IR/op-asm-interface.mlir @@ -22,3 +22,63 @@ func.func @block_argument_name_from_op_asm_type_interface() { } return } + +// ----- + +//===----------------------------------------------------------------------===// +// Test OpAsmTypeInterface +//===----------------------------------------------------------------------===// + +func.func @result_name_from_op_asm_type_interface_asmprinter() { + // CHECK-LABEL: @result_name_from_op_asm_type_interface_asmprinter + // CHECK: %op_asm_type_interface + %0 = "test.result_name_from_type_interface"() : () -> !test.op_asm_type_interface + return +} + +// ----- + +// i1 does not have OpAsmTypeInterface, should not get named. +func.func @result_name_from_op_asm_type_interface_not_all() { + // CHECK-LABEL: @result_name_from_op_asm_type_interface_not_all + // CHECK-NOT: %op_asm_type_interface + // CHECK: %0:2 + %0:2 = "test.result_name_from_type_interface"() : () -> (!test.op_asm_type_interface, i1) + return +} + +// ----- + +func.func @block_argument_name_from_op_asm_type_interface_asmprinter() { + // CHECK-LABEL: @block_argument_name_from_op_asm_type_interface_asmprinter + // CHECK: ^bb0(%op_asm_type_interface + test.block_argument_name_from_type_interface { + ^bb0(%arg0: !test.op_asm_type_interface): + "test.terminator"() : ()->() + } + return +} + +// ----- + +// CHECK: !op_asm_type_interface_type = +!type = !test.op_asm_type_interface + +func.func @alias_from_op_asm_type_interface() { + %0 = "test.result_name_from_type"() : () -> !type + return +} + +// ----- + +//===----------------------------------------------------------------------===// +// Test OpAsmAttrInterface +//===----------------------------------------------------------------------===// + +// CHECK: #op_asm_attr_interface_test +#attr = #test.op_asm_attr_interface + +func.func @test_op_asm_attr_interface() { + %1 = "test.result_name_from_type"() {attr = #attr} : () -> !test.op_asm_type_interface + return +} diff --git a/mlir/test/Target/LLVMIR/Import/alias.ll b/mlir/test/Target/LLVMIR/Import/alias.ll index 3ab68a7d8fb81..23eaecb9c9fa7 100644 --- a/mlir/test/Target/LLVMIR/Import/alias.ll +++ b/mlir/test/Target/LLVMIR/Import/alias.ll @@ -68,14 +68,6 @@ entry: @a1 = private alias i32, ptr @g1 @a2 = private alias ptr, ptr @a1 -; CHECK: llvm.mlir.alias private @a1 {dso_local} : i32 { -; CHECK: %[[ADDR:.*]] = llvm.mlir.addressof @g1 : !llvm.ptr -; CHECK: llvm.return %[[ADDR]] : !llvm.ptr -; CHECK: } -; CHECK: llvm.mlir.alias private @a2 {dso_local} : !llvm.ptr { -; CHECK-NEXT: %[[ADDR:.*]] = llvm.mlir.addressof @a1 : !llvm.ptr -; CHECK-NEXT: llvm.return %[[ADDR]] : !llvm.ptr -; CHECK-NEXT: } ; CHECK: llvm.mlir.global internal constant @g2() {addr_space = 0 : i32, dso_local} : !llvm.ptr { ; CHECK-NEXT: %[[ADDR:.*]] = llvm.mlir.addressof @a1 : !llvm.ptr @@ -86,3 +78,13 @@ entry: ; CHECK-NEXT: %[[ADDR:.*]] = llvm.mlir.addressof @a2 : !llvm.ptr ; CHECK-NEXT: llvm.return %[[ADDR]] : !llvm.ptr ; CHECK-NEXT: } + 
+; CHECK: llvm.mlir.alias private @a1 {dso_local} : i32 { +; CHECK-NEXT: %[[ADDR:.*]] = llvm.mlir.addressof @g1 : !llvm.ptr +; CHECK-NEXT: llvm.return %[[ADDR]] : !llvm.ptr +; CHECK-NEXT: } + +; CHECK: llvm.mlir.alias private @a2 {dso_local} : !llvm.ptr { +; CHECK-NEXT: %[[ADDR:.*]] = llvm.mlir.addressof @a1 : !llvm.ptr +; CHECK-NEXT: llvm.return %[[ADDR]] : !llvm.ptr +; CHECK-NEXT: } diff --git a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir index 7f21095763a39..02b84ff66a0d3 100644 --- a/mlir/test/Target/LLVMIR/omptarget-llvm.mlir +++ b/mlir/test/Target/LLVMIR/omptarget-llvm.mlir @@ -485,3 +485,120 @@ llvm.func @_QPopenmp_target_data_update() { // CHECK: call void @__tgt_target_data_update_mapper(ptr @2, i64 -1, i32 1, ptr %[[BASEPTRS_VAL_2]], ptr %[[PTRS_VAL_2]], ptr @{{.*}}, ptr @{{.*}}, ptr @{{.*}}, ptr null) // CHECK: ret void + +// ----- + +omp.declare_mapper @_QQFmy_testmy_mapper : !llvm.struct<"_QFmy_testTmy_type", (i32)> { +^bb0(%arg0: !llvm.ptr): + %0 = llvm.mlir.constant(0 : i32) : i32 + %1 = llvm.getelementptr %arg0[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)> + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, i32) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "var%data"} + %3 = omp.map.info var_ptr(%arg0 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) map_clauses(tofrom) capture(ByRef) members(%2 : [0] : !llvm.ptr) -> !llvm.ptr {name = "var", partial_map = true} + omp.declare_mapper.info map_entries(%3, %2 : !llvm.ptr, !llvm.ptr) +} + +llvm.func @_QPopenmp_target_data_mapper() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x !llvm.struct<"_QFmy_testTmy_type", (i32)> {bindc_name = "a"} : (i64) -> !llvm.ptr + %2 = omp.map.info var_ptr(%1 : !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)>) mapper(@_QQFmy_testmy_mapper) map_clauses(tofrom) capture(ByRef) -> !llvm.ptr {name = "a"} + omp.target_data map_entries(%2 : !llvm.ptr) { + %3 = llvm.mlir.constant(10 : i32) : i32 + %4 = llvm.getelementptr %1[0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct<"_QFmy_testTmy_type", (i32)> + llvm.store %3, %4 : i32, !llvm.ptr + omp.terminator + } + llvm.return +} + +// CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 4] +// CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 3] +// CHECK-LABEL: define void @_QPopenmp_target_data_mapper +// CHECK: %[[VAL_0:.*]] = alloca [1 x ptr], align 8 +// CHECK: %[[VAL_1:.*]] = alloca [1 x ptr], align 8 +// CHECK: %[[VAL_2:.*]] = alloca [1 x ptr], align 8 +// CHECK: %[[VAL_3:.*]] = alloca %[[VAL_4:.*]], i64 1, align 8 +// CHECK: br label %[[VAL_5:.*]] +// CHECK: entry: ; preds = %[[VAL_6:.*]] +// CHECK: %[[VAL_7:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0 +// CHECK: store ptr %[[VAL_3]], ptr %[[VAL_7]], align 8 +// CHECK: %[[VAL_8:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0 +// CHECK: store ptr %[[VAL_3]], ptr %[[VAL_8]], align 8 +// CHECK: %[[VAL_9:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_2]], i64 0, i64 0 +// CHECK: store ptr @.omp_mapper._QQFmy_testmy_mapper, ptr %[[VAL_9]], align 8 +// CHECK: %[[VAL_10:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0 +// CHECK: %[[VAL_11:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0 +// CHECK: call void @__tgt_target_data_begin_mapper(ptr @4, i64 -1, i32 1, ptr %[[VAL_10]], ptr %[[VAL_11]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr 
+// CHECK: %[[VAL_12:.*]] = getelementptr %[[VAL_4]], ptr %[[VAL_3]], i32 0, i32 0
+// CHECK: store i32 10, ptr %[[VAL_12]], align 4
+// CHECK: %[[VAL_13:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_0]], i32 0, i32 0
+// CHECK: %[[VAL_14:.*]] = getelementptr inbounds [1 x ptr], ptr %[[VAL_1]], i32 0, i32 0
+// CHECK: call void @__tgt_target_data_end_mapper(ptr @4, i64 -1, i32 1, ptr %[[VAL_13]], ptr %[[VAL_14]], ptr @.offload_sizes, ptr @.offload_maptypes, ptr @.offload_mapnames, ptr %[[VAL_2]])
+// CHECK: ret void
+
+// CHECK-LABEL: define internal void @.omp_mapper._QQFmy_testmy_mapper
+// CHECK: entry:
+// CHECK: %[[VAL_15:.*]] = udiv exact i64 %[[VAL_16:.*]], 4
+// CHECK: %[[VAL_17:.*]] = getelementptr %[[VAL_18:.*]], ptr %[[VAL_19:.*]], i64 %[[VAL_15]]
+// CHECK: %[[VAL_20:.*]] = icmp sgt i64 %[[VAL_15]], 1
+// CHECK: %[[VAL_21:.*]] = and i64 %[[VAL_22:.*]], 8
+// CHECK: %[[VAL_23:.*]] = icmp ne ptr %[[VAL_24:.*]], %[[VAL_19]]
+// CHECK: %[[VAL_25:.*]] = and i64 %[[VAL_22]], 16
+// CHECK: %[[VAL_26:.*]] = icmp ne i64 %[[VAL_25]], 0
+// CHECK: %[[VAL_27:.*]] = and i1 %[[VAL_23]], %[[VAL_26]]
+// CHECK: %[[VAL_28:.*]] = or i1 %[[VAL_20]], %[[VAL_27]]
+// CHECK: %[[VAL_29:.*]] = icmp eq i64 %[[VAL_21]], 0
+// CHECK: %[[VAL_30:.*]] = and i1 %[[VAL_28]], %[[VAL_29]]
+// CHECK: br i1 %[[VAL_30]], label %[[VAL_31:.*]], label %[[VAL_32:.*]]
+// CHECK: .omp.array..init: ; preds = %[[VAL_33:.*]]
+// CHECK: %[[VAL_34:.*]] = mul nuw i64 %[[VAL_15]], 4
+// CHECK: %[[VAL_35:.*]] = and i64 %[[VAL_22]], -4
+// CHECK: %[[VAL_36:.*]] = or i64 %[[VAL_35]], 512
+// CHECK: call void @__tgt_push_mapper_component(ptr %[[VAL_37:.*]], ptr %[[VAL_24]], ptr %[[VAL_19]], i64 %[[VAL_34]], i64 %[[VAL_36]], ptr %[[VAL_38:.*]])
+// CHECK: br label %[[VAL_32]]
+// CHECK: omp.arraymap.head: ; preds = %[[VAL_31]], %[[VAL_33]]
+// CHECK: %[[VAL_39:.*]] = icmp eq ptr %[[VAL_19]], %[[VAL_17]]
+// CHECK: br i1 %[[VAL_39]], label %[[VAL_40:.*]], label %[[VAL_41:.*]]
+// CHECK: omp.arraymap.body: ; preds = %[[VAL_42:.*]], %[[VAL_32]]
+// CHECK: %[[VAL_43:.*]] = phi ptr [ %[[VAL_19]], %[[VAL_32]] ], [ %[[VAL_44:.*]], %[[VAL_42]] ]
+// CHECK: %[[VAL_45:.*]] = getelementptr %[[VAL_18]], ptr %[[VAL_43]], i32 0, i32 0
+// CHECK: %[[VAL_46:.*]] = call i64 @__tgt_mapper_num_components(ptr %[[VAL_37]])
+// CHECK: %[[VAL_47:.*]] = shl i64 %[[VAL_46]], 48
+// CHECK: %[[VAL_48:.*]] = add nuw i64 3, %[[VAL_47]]
+// CHECK: %[[VAL_49:.*]] = and i64 %[[VAL_22]], 3
+// CHECK: %[[VAL_50:.*]] = icmp eq i64 %[[VAL_49]], 0
+// CHECK: br i1 %[[VAL_50]], label %[[VAL_51:.*]], label %[[VAL_52:.*]]
+// CHECK: omp.type.alloc: ; preds = %[[VAL_41]]
+// CHECK: %[[VAL_53:.*]] = and i64 %[[VAL_48]], -4
+// CHECK: br label %[[VAL_42]]
+// CHECK: omp.type.alloc.else: ; preds = %[[VAL_41]]
+// CHECK: %[[VAL_54:.*]] = icmp eq i64 %[[VAL_49]], 1
+// CHECK: br i1 %[[VAL_54]], label %[[VAL_55:.*]], label %[[VAL_56:.*]]
+// CHECK: omp.type.to: ; preds = %[[VAL_52]]
+// CHECK: %[[VAL_57:.*]] = and i64 %[[VAL_48]], -3
+// CHECK: br label %[[VAL_42]]
+// CHECK: omp.type.to.else: ; preds = %[[VAL_52]]
+// CHECK: %[[VAL_58:.*]] = icmp eq i64 %[[VAL_49]], 2
+// CHECK: br i1 %[[VAL_58]], label %[[VAL_59:.*]], label %[[VAL_42]]
+// CHECK: omp.type.from: ; preds = %[[VAL_56]]
+// CHECK: %[[VAL_60:.*]] = and i64 %[[VAL_48]], -2
+// CHECK: br label %[[VAL_42]]
+// CHECK: omp.type.end: ; preds = %[[VAL_59]], %[[VAL_56]], %[[VAL_55]], %[[VAL_51]]
+// CHECK: %[[VAL_61:.*]] = phi i64 [ %[[VAL_53]], %[[VAL_51]] ], [ %[[VAL_57]], %[[VAL_55]] ], [ %[[VAL_60]], %[[VAL_59]] ], [ %[[VAL_48]], %[[VAL_56]] ]
+// CHECK: call void @__tgt_push_mapper_component(ptr %[[VAL_37]], ptr %[[VAL_43]], ptr %[[VAL_45]], i64 4, i64 %[[VAL_61]], ptr @2)
+// CHECK: %[[VAL_44]] = getelementptr %[[VAL_18]], ptr %[[VAL_43]], i32 1
+// CHECK: %[[VAL_62:.*]] = icmp eq ptr %[[VAL_44]], %[[VAL_17]]
+// CHECK: br i1 %[[VAL_62]], label %[[VAL_63:.*]], label %[[VAL_41]]
+// CHECK: omp.arraymap.exit: ; preds = %[[VAL_42]]
+// CHECK: %[[VAL_64:.*]] = icmp sgt i64 %[[VAL_15]], 1
+// CHECK: %[[VAL_65:.*]] = and i64 %[[VAL_22]], 8
+// CHECK: %[[VAL_66:.*]] = icmp ne i64 %[[VAL_65]], 0
+// CHECK: %[[VAL_67:.*]] = and i1 %[[VAL_64]], %[[VAL_66]]
+// CHECK: br i1 %[[VAL_67]], label %[[VAL_68:.*]], label %[[VAL_40]]
+// CHECK: .omp.array..del: ; preds = %[[VAL_63]]
+// CHECK: %[[VAL_69:.*]] = mul nuw i64 %[[VAL_15]], 4
+// CHECK: %[[VAL_70:.*]] = and i64 %[[VAL_22]], -4
+// CHECK: %[[VAL_71:.*]] = or i64 %[[VAL_70]], 512
+// CHECK: call void @__tgt_push_mapper_component(ptr %[[VAL_37]], ptr %[[VAL_24]], ptr %[[VAL_19]], i64 %[[VAL_69]], i64 %[[VAL_71]], ptr %[[VAL_38]])
+// CHECK: br label %[[VAL_40]]
+// CHECK: omp.done: ; preds = %[[VAL_68]], %[[VAL_63]], %[[VAL_32]]
+// CHECK: ret void
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index eac28c57e2ab4..84a30277e63da 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -851,12 +851,23 @@ llvm.func @rocdl.make.buffer.rsrc(%ptr : !llvm.ptr,
                                   %stride : i16,
                                   %numRecords : i32,
                                   %flags : i32) -> !llvm.ptr<8> {
   // CHECK-LABEL: rocdl.make.buffer.rsrc
-  // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: %[[rsrc:.*]] = call ptr addrspace(8) @llvm.amdgcn.make.buffer.rsrc.p8.p0(ptr %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
   // CHECK: ret ptr addrspace(8) %[[rsrc]]
   %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : !llvm.ptr to !llvm.ptr<8>
   llvm.return %rsrc : !llvm.ptr<8>
 }
 
+llvm.func @rocdl.make.buffer.rsrc.p7.p1(%ptr : !llvm.ptr<1>,
+                                        %stride : i16,
+                                        %numRecords : i32,
+                                        %flags : i32) -> !llvm.ptr<7> {
+  // CHECK-LABEL: rocdl.make.buffer.rsrc.p7.p1
+  // CHECK: %[[rsrc:.*]] = call ptr addrspace(7) @llvm.amdgcn.make.buffer.rsrc.p7.p1(ptr addrspace(1) %{{.*}}, i16 %{{.*}}, i32 %{{.*}}, i32 %{{.*}})
+  // CHECK: ret ptr addrspace(7) %[[rsrc]]
+  %rsrc = rocdl.make.buffer.rsrc %ptr, %stride, %numRecords, %flags : <1> to <7>
+  llvm.return %rsrc : !llvm.ptr<7>
+}
+
 llvm.func @rocdl.wmma.fp8(%arg0 : vector<2 x i32>, %arg1 : vector<8xf32>) -> vector<8xf32> {
   // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> %{{.*}}, <2 x i32> %{{.*}}, <8 x float> %{{.*}})
   %r0 = rocdl.wmma.f32.16x16x16.fp8_fp8 %arg0, %arg0, %arg1: (vector<2xi32>, vector<2xi32>, vector<8xf32>) -> vector<8xf32>
diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
index 0fd272f85d39b..4b809c1c0a765 100644
--- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td
@@ -395,4 +395,14 @@ def TestCustomLocationAttr : Test_LocAttr<"TestCustomLocation"> {
   let assemblyFormat = "`<` $file `*` $line `>`";
 }
 
+// Test OpAsmAttrInterface.
+def TestOpAsmAttrInterfaceAttr : Test_Attr<"TestOpAsmAttrInterface",
+    [DeclareAttrInterfaceMethods<OpAsmAttrInterface, ["getAlias"]>]> {
+  let mnemonic = "op_asm_attr_interface";
+  let parameters = (ins "mlir::StringAttr":$value);
+  let assemblyFormat = [{
+    `<` struct(params) `>`
+  }];
+}
+
 #endif // TEST_ATTRDEFS
diff --git a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
index e09ea10906164..7c467308386f1 100644
--- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp
@@ -67,7 +67,7 @@ void CompoundAAttr::print(AsmPrinter &printer) const {
 //===----------------------------------------------------------------------===//
 
 Attribute TestDecimalShapeAttr::parse(AsmParser &parser, Type type) {
-  if (parser.parseLess()){
+  if (parser.parseLess()) {
     return Attribute();
   }
   SmallVector<int64_t> shape;
@@ -316,6 +316,17 @@ static ParseResult parseCustomFloatAttr(AsmParser &p, StringAttr &typeStrAttr,
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// TestOpAsmAttrInterfaceAttr
+//===----------------------------------------------------------------------===//
+
+::mlir::OpAsmDialectInterface::AliasResult
+TestOpAsmAttrInterfaceAttr::getAlias(::llvm::raw_ostream &os) const {
+  os << "op_asm_attr_interface_";
+  os << getValue().getValue();
+  return ::mlir::OpAsmDialectInterface::AliasResult::FinalAlias;
+}
+
 //===----------------------------------------------------------------------===//
 // Tablegen Generated Definitions
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 2aa0658ab0e5d..cdc1237ec8c5a 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -955,6 +955,25 @@ def BlockArgumentNameFromTypeOp
   let assemblyFormat = "regions attr-dict-with-keyword";
 }
 
+// This is used to test OpAsmTypeInterface::getAsmName's integration with
+// AsmPrinter for op result names when OpAsmOpInterface::getAsmResultNames is
+// the default implementation, i.e. it does nothing.
+def ResultNameFromTypeInterfaceOp
+    : TEST_Op<"result_name_from_type_interface",
+              [OpAsmOpInterface]> {
+  let results = (outs Variadic<AnyType>:$r);
+}
+
+// This is used to test OpAsmTypeInterface::getAsmName's integration with
+// AsmPrinter for block argument names when
+// OpAsmOpInterface::getAsmBlockArgumentNames is the default implementation,
+// i.e. it does nothing.
+def BlockArgumentNameFromTypeInterfaceOp
    : TEST_Op<"block_argument_name_from_type_interface",
+              [OpAsmOpInterface]> {
+  let regions = (region AnyRegion:$body);
+  let assemblyFormat = "regions attr-dict-with-keyword";
+}
+
 // This is used to test the OpAsmOpInterface::getDefaultDialect() feature:
 // operations nested in a region under this op will drop the "test." dialect
 // prefix.
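[Editorial note, not part of the patch] The test-dialect changes above exercise two AsmPrinter hooks: OpAsmTypeInterface::getAsmName, which suggests SSA value and block-argument names for values of a given type, and the getAlias hook (on both OpAsmTypeInterface and OpAsmAttrInterface), which lets a type or attribute request a module-level alias directly, without the owning dialect registering an OpAsmDialectInterface. A minimal sketch of what a downstream type would implement, mirroring the TestTypes.cpp changes in this patch; MyType is a hypothetical type assumed to be declared with DeclareTypeInterfaceMethods<OpAsmTypeInterface, ["getAsmName", "getAlias"]>, with the usual TypeDef boilerplate elided:

#include "mlir/IR/OpImplementation.h" // OpAsmSetNameFn, OpAsmDialectInterface

// Values of MyType are printed as %my_value, %my_value_0, ... instead of %0.
void MyType::getAsmName(mlir::OpAsmSetNameFn setNameFn) const {
  setNameFn("my_value");
}

// Asks the printer to emit an alias definition such as
// `!my_alias = !mydialect.my_type` once at module scope and to print
// `!my_alias` at every use. FinalAlias means the name is used as-is (modulo
// uniquing suffixes) instead of being treated as an overridable default.
mlir::OpAsmDialectInterface::AliasResult
MyType::getAlias(llvm::raw_ostream &os) const {
  os << "my_alias";
  return mlir::OpAsmDialectInterface::AliasResult::FinalAlias;
}

Both test implementations in this patch return FinalAlias, which is what keeps the printed names in op-asm-interface.mlir stable for FileCheck.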
diff --git a/mlir/test/lib/Dialect/Test/TestTypeDefs.td b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
index 6335701786ecc..c048f8b654ec2 100644
--- a/mlir/test/lib/Dialect/Test/TestTypeDefs.td
+++ b/mlir/test/lib/Dialect/Test/TestTypeDefs.td
@@ -399,7 +399,7 @@ def TestTypeVerification : Test_Type<"TestTypeVerification"> {
 }
 
 def TestTypeOpAsmTypeInterface : Test_Type<"TestTypeOpAsmTypeInterface",
-    [DeclareTypeInterfaceMethods<OpAsmTypeInterface, ["getAsmName"]>]> {
+    [DeclareTypeInterfaceMethods<OpAsmTypeInterface, ["getAsmName", "getAlias"]>]> {
   let mnemonic = "op_asm_type_interface";
 }
diff --git a/mlir/test/lib/Dialect/Test/TestTypes.cpp b/mlir/test/lib/Dialect/Test/TestTypes.cpp
index 1ae7ac472d989..0c237440834ef 100644
--- a/mlir/test/lib/Dialect/Test/TestTypes.cpp
+++ b/mlir/test/lib/Dialect/Test/TestTypes.cpp
@@ -537,3 +537,9 @@ void TestTypeOpAsmTypeInterfaceType::getAsmName(
     OpAsmSetNameFn setNameFn) const {
   setNameFn("op_asm_type_interface");
 }
+
+::mlir::OpAsmDialectInterface::AliasResult
+TestTypeOpAsmTypeInterfaceType::getAlias(::llvm::raw_ostream &os) const {
+  os << "op_asm_type_interface_type";
+  return ::mlir::OpAsmDialectInterface::AliasResult::FinalAlias;
+}
diff --git a/mlir/test/mlir-runner/math-polynomial-approx.mlir b/mlir/test/mlir-runner/math-polynomial-approx.mlir
index 148ef25cead62..6ed03916f1e15 100644
--- a/mlir/test/mlir-runner/math-polynomial-approx.mlir
+++ b/mlir/test/mlir-runner/math-polynomial-approx.mlir
@@ -273,6 +273,77 @@ func.func @erf() {
   return
 }
 
+// -------------------------------------------------------------------------- //
+// Erfc.
+// -------------------------------------------------------------------------- //
+func.func @erfc_f32(%a : f32) {
+  %r = math.erfc %a : f32
+  vector.print %r : f32
+  return
+}
+
+func.func @erfc_4xf32(%a : vector<4xf32>) {
+  %r = math.erfc %a : vector<4xf32>
+  vector.print %r : vector<4xf32>
+  return
+}
+
+func.func @erfc() {
+  // CHECK: 1.00027
+  %val1 = arith.constant -2.431864e-4 : f32
+  call @erfc_f32(%val1) : (f32) -> ()
+
+  // CHECK: 0.257905
+  %val2 = arith.constant 0.79999 : f32
+  call @erfc_f32(%val2) : (f32) -> ()
+
+  // CHECK: 0.257899
+  %val3 = arith.constant 0.8 : f32
+  call @erfc_f32(%val3) : (f32) -> ()
+
+  // CHECK: 0.00467794
+  %val4 = arith.constant 1.99999 : f32
+  call @erfc_f32(%val4) : (f32) -> ()
+
+  // CHECK: 0.00467774
+  %val5 = arith.constant 2.0 : f32
+  call @erfc_f32(%val5) : (f32) -> ()
+
+  // CHECK: 1.13736e-07
+  %val6 = arith.constant 3.74999 : f32
+  call @erfc_f32(%val6) : (f32) -> ()
+
+  // CHECK: 1.13727e-07
+  %val7 = arith.constant 3.75 : f32
+  call @erfc_f32(%val7) : (f32) -> ()
+
+  // CHECK: 2
+  %negativeInf = arith.constant 0xff800000 : f32
+  call @erfc_f32(%negativeInf) : (f32) -> ()
+
+  // CHECK: 2, 2, 1.91376, 1.73145
+  %vecVals1 = arith.constant dense<[-3.4028235e+38, -4.54318, -1.2130899, -7.8234202e-01]> : vector<4xf32>
+  call @erfc_4xf32(%vecVals1) : (vector<4xf32>) -> ()
+
+  // CHECK: 1, 1, 1, 0.878681
+  %vecVals2 = arith.constant dense<[-1.1754944e-38, 0.0, 1.1754944e-38, 1.0793410e-01]> : vector<4xf32>
+  call @erfc_4xf32(%vecVals2) : (vector<4xf32>) -> ()
+
+  // CHECK: 0.0805235, 0.000931045, 6.40418e-08, 0
+  %vecVals3 = arith.constant dense<[1.23578, 2.34093, 3.82342, 3.4028235e+38]> : vector<4xf32>
+  call @erfc_4xf32(%vecVals3) : (vector<4xf32>) -> ()
+
+  // CHECK: 0
+  %inf = arith.constant 0x7f800000 : f32
+  call @erfc_f32(%inf) : (f32) -> ()
+
+  // CHECK: nan
+  %nan = arith.constant 0x7fc00000 : f32
+  call @erfc_f32(%nan) : (f32) -> ()
+
+  return
+}
+
 // -------------------------------------------------------------------------- //
 // Exp.
 // -------------------------------------------------------------------------- //
@@ -772,6 +843,7 @@ func.func @main() {
   call @log2(): () -> ()
   call @log1p(): () -> ()
   call @erf(): () -> ()
+  call @erfc(): () -> ()
   call @exp(): () -> ()
   call @expm1(): () -> ()
   call @sin(): () -> ()
diff --git a/mlir/utils/vim/syntax/mlir.vim b/mlir/utils/vim/syntax/mlir.vim
index 7989032eada88..070d81658ca3d 100644
--- a/mlir/utils/vim/syntax/mlir.vim
+++ b/mlir/utils/vim/syntax/mlir.vim
@@ -44,6 +44,7 @@ syn keyword mlirOps view
 " Math ops.
 syn match mlirOps /\<erf\>/
+syn match mlirOps /\<erfc\>/
 
 " Affine ops.
 syn match mlirOps /\<affine\.apply\>/
diff --git a/offload/libomptarget/OpenMP/Mapping.cpp b/offload/libomptarget/OpenMP/Mapping.cpp
index 4b78ed3360a26..14f5e7dc9d19f 100644
--- a/offload/libomptarget/OpenMP/Mapping.cpp
+++ b/offload/libomptarget/OpenMP/Mapping.cpp
@@ -141,48 +141,45 @@ LookupResult MappingInfoTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
   if (HDTTMap->empty())
     return LR;
 
+  // HDTTMap is std::set, ordered by HstPtrBegin.
+  // Upper is the first element whose HstPtrBegin > HP.
   auto Upper = HDTTMap->upper_bound(HP);
 
   if (Size == 0) {
-    // specification v5.1 Pointer Initialization for Device Data Environments
-    // upper_bound satisfies
-    // std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin
+    // HP satisfies
+    // std::prev(Upper)->HDTT.HstPtrBegin <= HP < Upper->HDTT.HstPtrBegin
     if (Upper != HDTTMap->begin()) {
      LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR);
-      // the left side of extended address range is satisfied.
-      // hp >= LR.TPR.getEntry()->HstPtrBegin || hp >=
-      // LR.TPR.getEntry()->HstPtrBase
-      LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd ||
-                             HP < LR.TPR.getEntry()->HstPtrBase;
+      // We know that HP >= LR.TPR.getEntry()->HstPtrBegin
+      LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd;
     }
 
     if (!LR.Flags.IsContained && Upper != HDTTMap->end()) {
       LR.TPR.setEntry(Upper->HDTT, OwnedTPR);
-      // the right side of extended address range is satisfied.
-      // hp < LR.TPR.getEntry()->HstPtrEnd || hp < LR.TPR.getEntry()->HstPtrBase
+      // This is a special case: HP is not really contained in the mapped
+      // address range, but it's contained in the extended address range,
+      // which suffices to get the mapping of the base pointer.
+      // We know that HP < LR.TPR.getEntry()->HstPtrBegin
      LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBase;
     }
  } else {
-    // check the left bin
    if (Upper != HDTTMap->begin()) {
       LR.TPR.setEntry(std::prev(Upper)->HDTT, OwnedTPR);
-      // Is it contained?
-      LR.Flags.IsContained = HP >= LR.TPR.getEntry()->HstPtrBegin &&
-                             HP < LR.TPR.getEntry()->HstPtrEnd &&
+      // We know that HP >= LR.TPR.getEntry()->HstPtrBegin
+      LR.Flags.IsContained = HP < LR.TPR.getEntry()->HstPtrEnd &&
                              (HP + Size) <= LR.TPR.getEntry()->HstPtrEnd;
-      // Does it extend beyond the mapped region?
+      // Does it extend beyond the mapped address range?
       LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd &&
                               (HP + Size) > LR.TPR.getEntry()->HstPtrEnd;
     }
 
-    // check the right bin
     if (!(LR.Flags.IsContained || LR.Flags.ExtendsAfter) &&
         Upper != HDTTMap->end()) {
       LR.TPR.setEntry(Upper->HDTT, OwnedTPR);
-      // Does it extend into an already mapped region?
-      LR.Flags.ExtendsBefore = HP < LR.TPR.getEntry()->HstPtrBegin &&
-                               (HP + Size) > LR.TPR.getEntry()->HstPtrBegin;
-      // Does it extend beyond the mapped region?
+      // Does it extend into an already mapped address range?
+      // We know that HP < LR.TPR.getEntry()->HstPtrBegin
+      LR.Flags.ExtendsBefore = (HP + Size) > LR.TPR.getEntry()->HstPtrBegin;
+      // Does it extend beyond the mapped address range?
       LR.Flags.ExtendsAfter = HP < LR.TPR.getEntry()->HstPtrEnd &&
                               (HP + Size) > LR.TPR.getEntry()->HstPtrEnd;
     }
diff --git a/offload/test/offloading/fortran/target-custom-mapper.f90 b/offload/test/offloading/fortran/target-custom-mapper.f90
new file mode 100644
index 0000000000000..9c527861c87b5
--- /dev/null
+++ b/offload/test/offloading/fortran/target-custom-mapper.f90
@@ -0,0 +1,53 @@
+! Offloading test checking that a derived type is mapped correctly through
+! named user-defined mappers (declare mapper), including one mapper that
+! invokes another.
+! REQUIRES: flang, amdgpu
+
+! RUN: %libomptarget-compile-fortran-run-and-check-generic
+
+program test_openmp_mapper
+   implicit none
+   integer, parameter :: n = 1024
+   type :: mytype
+      integer :: data(n)
+   end type mytype
+
+   type :: mytype2
+      type(mytype) :: my_data
+   end type mytype2
+
+   ! Declare a custom mapper for the derived type `mytype`.
+   !$omp declare mapper(my_mapper1 : mytype :: t) map(to: t%data(1 : n))
+
+   ! Declare a custom mapper for the derived type `mytype2`.
+   !$omp declare mapper(my_mapper2 : mytype2 :: t) map(mapper(my_mapper1): t%my_data)
+
+   type(mytype2) :: obj
+   integer :: i, sum_host, sum_device
+
+   ! Initialize the host data.
+   do i = 1, n
+      obj%my_data%data(i) = 1
+   end do
+
+   ! Compute the sum on the host for verification.
+   sum_host = sum(obj%my_data%data)
+
+   ! Offload the computation to the device using the named mapper `my_mapper2`.
+   sum_device = 0
+   !$omp target map(tofrom: sum_device) map(mapper(my_mapper2) : obj)
+   do i = 1, n
+      sum_device = sum_device + obj%my_data%data(i)
+   end do
+   !$omp end target
+
+   ! Check the results.
+   print *, "Sum on host:   ", sum_host
+   print *, "Sum on device: ", sum_device
+
+   if (sum_device == sum_host) then
+      print *, "Test passed!"
+   else
+      print *, "Test failed!"
+   end if
+end program test_openmp_mapper
+
+! CHECK: Test passed!
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 53aca8ab042ad..05385ba491525 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -96,6 +96,14 @@ exports_files(glob(["include/**/*.td"]))
 gentbl_cc_library(
     name = "OpAsmInterfaceIncGen",
     tbl_outs = [
+        (
+            ["-gen-attr-interface-decls"],
+            "include/mlir/IR/OpAsmAttrInterface.h.inc",
+        ),
+        (
+            ["-gen-attr-interface-defs"],
+            "include/mlir/IR/OpAsmAttrInterface.cpp.inc",
+        ),
         (
             ["-gen-op-interface-decls"],
             "include/mlir/IR/OpAsmOpInterface.h.inc",
@@ -1729,11 +1737,17 @@ gentbl_cc_library(
     name = "EmitCOpsIncGen",
     tbl_outs = [
         (
-            ["-gen-dialect-decls"],
+            [
+                "-gen-dialect-decls",
+                "-dialect=emitc",
+            ],
             "include/mlir/Dialect/EmitC/IR/EmitCDialect.h.inc",
         ),
         (
-            ["-gen-dialect-defs"],
+            [
+                "-gen-dialect-defs",
+                "-dialect=emitc",
+            ],
             "include/mlir/Dialect/EmitC/IR/EmitCDialect.cpp.inc",
         ),
         (
@@ -11533,34 +11547,20 @@ cc_library(
 
 cc_library(
     name = "LinalgInterfaces",
-    srcs = [
-        "include/mlir/Dialect/Linalg/IR/Linalg.h",
-        "lib/Dialect/Linalg/IR/LinalgInterfaces.cpp",
-    ],
+    # Note: LinalgInterfaces.cpp is part of LinalgDialect, not this target.
+    # This allows TensorDialect to use the header-only RelayoutOpInterface
+    # without an implicit dependency on the LinalgDialect.
     hdrs = ["include/mlir/Dialect/Linalg/IR/LinalgInterfaces.h"],
     includes = ["include"],
     deps = [
-        ":AffineDialect",
-        ":ArithDialect",
-        ":ArithUtils",
-        ":BytecodeOpInterface",
-        ":ComplexDialect",
-        ":ControlFlowInterfaces",
-        ":CopyOpInterface",
         ":DestinationStyleOpInterface",
         ":DialectUtils",
         ":IR",
         ":InferTypeOpInterface",
-        ":LinalgEnumsIncGen",
         ":LinalgInterfacesIncGen",
-        ":LinalgOpsIncGen",
-        ":LinalgRelayoutOpsIncGen",
         ":LinalgStructuredOpsIncGen",
-        ":SideEffectInterfaces",
         ":Support",
-        ":TilingInterface",
         ":ViewLikeInterface",
-        "//llvm:Support",
     ],
 )
 
@@ -11568,6 +11568,7 @@ cc_library(
     name = "LinalgDialect",
     srcs = [
         "lib/Dialect/Linalg/IR/LinalgDialect.cpp",
+        "lib/Dialect/Linalg/IR/LinalgInterfaces.cpp",
         "lib/Dialect/Linalg/IR/LinalgOps.cpp",
         "lib/Dialect/Linalg/IR/ValueBoundsOpInterfaceImpl.cpp",
     ],
@@ -11594,6 +11595,7 @@ cc_library(
         ":InliningUtils",
         ":LinalgEnumsIncGen",
         ":LinalgInterfaces",
+        ":LinalgInterfacesIncGen",
         ":LinalgNamedStructuredOpsYamlIncGen",
         ":LinalgOpsIncGen",
         ":LinalgRelayoutOpsIncGen",
diff --git a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
index d0c9f56f81cb9..a55c6f50118dc 100644
--- a/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/unittests/BUILD.bazel
@@ -176,7 +176,6 @@ cc_test(
         "//mlir:ArithDialect",
         "//mlir:FuncDialect",
         "//mlir:IR",
-        "//mlir:LinalgDialect",
         "//mlir:Parser",
         "//mlir:SCFDialect",
         "//mlir:SideEffectInterfaces",
@@ -212,7 +211,6 @@ cc_test(
         "//llvm:Support",
         "//llvm:TestingSupport",
         "//mlir:IR",
-        "//mlir:LinalgDialect",
         "//mlir:SPIRVBinaryUtils",
         "//mlir:SPIRVDeserialization",
         "//mlir:SPIRVDialect",
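[Editorial note, not part of the patch] The lookupMapping rewrite in offload/libomptarget/OpenMP/Mapping.cpp above leans on one invariant, now spelled out in its comments: the map is ordered by HstPtrBegin, so after Upper = upper_bound(HP), only std::prev(Upper) can contain HP (its begin is <= HP), and only Upper can be entered from below by [HP, HP + Size) (its begin is > HP). That is what lets each branch drop the redundant bound checks. A standalone, compilable sketch of the Size != 0 path; Interval, Lookup, classify, and the test values are hypothetical stand-ins for the real map-entry types:

#include <cassert>
#include <cstdint>
#include <iterator>
#include <set>

// Stand-in for one mapped host range [Begin, End); like the real map, the
// set is ordered by the range's begin address.
struct Interval {
  uintptr_t Begin, End;
  bool operator<(const Interval &O) const { return Begin < O.Begin; }
};

struct Lookup {
  bool IsContained = false, ExtendsBefore = false, ExtendsAfter = false;
};

Lookup classify(const std::set<Interval> &Map, uintptr_t HP, uintptr_t Size) {
  Lookup LR;
  // First entry whose Begin is > HP.
  auto Upper = Map.upper_bound(Interval{HP, 0});
  if (Upper != Map.begin()) {
    // Only containment candidate: we know E.Begin <= HP.
    const Interval &E = *std::prev(Upper);
    LR.IsContained = HP < E.End && HP + Size <= E.End;
    LR.ExtendsAfter = HP < E.End && HP + Size > E.End;
  }
  if (!(LR.IsContained || LR.ExtendsAfter) && Upper != Map.end()) {
    // Only ExtendsBefore candidate: we know HP < E.Begin.
    const Interval &E = *Upper;
    LR.ExtendsBefore = HP + Size > E.Begin;
    LR.ExtendsAfter = HP < E.End && HP + Size > E.End;
  }
  return LR;
}

int main() {
  std::set<Interval> Map = {{100, 200}, {300, 400}};
  assert(classify(Map, 120, 40).IsContained);   // fully inside [100, 200)
  assert(classify(Map, 180, 40).ExtendsAfter);  // starts inside, runs past 200
  assert(classify(Map, 280, 40).ExtendsBefore); // runs into [300, 400)
  return 0;
}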