diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml
index 49753c0746cbe..2d366028c2337 100644
--- a/.github/workflows/premerge.yaml
+++ b/.github/workflows/premerge.yaml
@@ -14,8 +14,6 @@ on:
       # do this is that it allows us to take advantage of concurrency groups
       # to cancel in progress CI jobs whenever the PR is closed.
       - closed
-    paths:
-      - .github/workflows/premerge.yaml
   push:
     branches:
       - 'main'
diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp
index bc45caf3ec8b7..1e427b2df11cf 100644
--- a/bolt/lib/Core/BinaryFunction.cpp
+++ b/bolt/lib/Core/BinaryFunction.cpp
@@ -498,6 +498,11 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) {
     if (!IslandOffset)
       return;
 
+    // Print label if it exists at this offset.
+    if (const BinaryData *BD =
+            BC.getBinaryDataAtAddress(getAddress() + *IslandOffset))
+      OS << BD->getName() << ":\n";
+
     const size_t IslandSize = getSizeOfDataInCodeAt(*IslandOffset);
     BC.printData(OS, BC.extractData(getAddress() + *IslandOffset, IslandSize),
                  *IslandOffset);
@@ -1066,7 +1071,7 @@ size_t BinaryFunction::getSizeOfDataInCodeAt(uint64_t Offset) const {
   auto Iter = Islands->CodeOffsets.upper_bound(Offset);
   if (Iter != Islands->CodeOffsets.end())
     return *Iter - Offset;
-  return getSize() - Offset;
+  return getMaxSize() - Offset;
 }
 
 std::optional<uint64_t>
diff --git a/bolt/test/AArch64/data-in-code.s b/bolt/test/AArch64/data-in-code.s
index 8d3179a0c3350..1df5d4568542f 100644
--- a/bolt/test/AArch64/data-in-code.s
+++ b/bolt/test/AArch64/data-in-code.s
@@ -7,7 +7,8 @@
 ## Check disassembly of BOLT input.
 # RUN: llvm-objdump %t.exe -d | FileCheck %s
 
-# RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm | FileCheck %s
+# RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm \
+# RUN:   | FileCheck %s --check-prefixes CHECK,CHECK-BOLT-ONLY
 
 .text
 .balign 4
@@ -16,16 +17,21 @@
 .type _start, %function
 _start:
   mov x0, #0x0
+  ldr x1, .L1
   .word 0x4f82e010
   ret
+.size _start, .-_start
+.L1:
   .byte 0x0, 0xff, 0x42
 # CHECK-LABEL: _start
 # CHECK:        mov x0, #0x0
+# CHECK-NEXT:   ldr x1
+# CHECK-BOLT-ONLY-SAME: ISLANDat[[ADDR:]]
 # CHECK-NEXT:   .word 0x4f82e010
 # CHECK-NEXT:   ret
+# CHECK-BOLT-ONLY-NEXT: ISLANDat[[ADDR]]
 # CHECK-NEXT:   .short 0xff00
 # CHECK-NEXT:   .byte 0x42
-.size _start, .-_start
 
 ## Force relocation mode.
   .reloc 0, R_AARCH64_NONE
diff --git a/clang-tools-extra/clangd/unittests/ASTTests.cpp b/clang-tools-extra/clangd/unittests/ASTTests.cpp
index 32c8e8a63a215..d0bc3c4d7db98 100644
--- a/clang-tools-extra/clangd/unittests/ASTTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ASTTests.cpp
@@ -329,7 +329,7 @@ TEST(ClangdAST, GetContainedAutoParamType) {
        auto &&d,
        auto *&e,
        auto (*f)(int)
-    ){};
+    ){ return 0; };
 
     int withoutAuto(
       int a,
@@ -338,7 +338,7 @@ TEST(ClangdAST, GetContainedAutoParamType) {
       int &&d,
       int *&e,
       int (*f)(int)
-    ){};
+    ){ return 0; };
   )cpp");
   TU.ExtraArgs.push_back("-std=c++20");
   auto AST = TU.build();
diff --git a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
index 4276a44275f53..282859c51a66f 100644
--- a/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindSymbolsTests.cpp
@@ -113,7 +113,7 @@ TEST(WorkspaceSymbols, Unnamed) {
 TEST(WorkspaceSymbols, InMainFile) {
   TestTU TU;
   TU.Code = R"cpp(
-      int test() {}
+      int test() { return 0; }
       static void test2() {}
       )cpp";
   EXPECT_THAT(getSymbols(TU, "test"),
@@ -537,12 +537,14 @@ TEST(DocumentSymbols, InHeaderFile) {
   TestTU TU;
   TU.AdditionalFiles["bar.h"] = R"cpp(
       int foo() {
+        return 0;
       }
       )cpp";
   TU.Code = R"cpp(
       int i; // declaration to finish preamble
       #include "bar.h"
       int test() {
+        return 0;
       }
       )cpp";
   EXPECT_THAT(getSymbols(TU.build()),
@@ -780,7 +782,7 @@ TEST(DocumentSymbols, FuncTemplates) {
   TestTU TU;
   Annotations Source(R"cpp(
     template <class T>
-    T foo() {}
+    T foo() { return T{}; }
 
     auto x = foo<int>();
     auto y = foo<double>();
diff --git a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
index 6ee641caeefe3..f9752d5d44f97 100644
--- a/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
+++ b/clang-tools-extra/clangd/unittests/ParsedASTTests.cpp
@@ -251,7 +251,7 @@ TEST(ParsedASTTest, NoCrashOnTokensWithTidyCheck) {
   // this check runs the preprocessor, we need to make sure it does not break
   // our recording logic.
   TU.ClangTidyProvider = addTidyChecks("modernize-use-trailing-return-type");
-  TU.Code = "inline int foo() {}";
+  TU.Code = "inline int foo() { return 0; }";
 
   auto AST = TU.build();
   const syntax::TokenBuffer &T = AST.getTokens();
diff --git a/clang-tools-extra/clangd/unittests/QualityTests.cpp b/clang-tools-extra/clangd/unittests/QualityTests.cpp
index 576779fa3270a..619ea32115357 100644
--- a/clang-tools-extra/clangd/unittests/QualityTests.cpp
+++ b/clang-tools-extra/clangd/unittests/QualityTests.cpp
@@ -108,7 +108,7 @@ TEST(QualityTests, SymbolRelevanceSignalExtraction) {
 
   using flags::FLAGS_FOO;
 
-  int ::header_main() {}
+  int ::header_main() { return 0; }
   int main();
 
   [[deprecated]]
diff --git a/clang-tools-extra/clangd/unittests/RenameTests.cpp b/clang-tools-extra/clangd/unittests/RenameTests.cpp
index 142ed171d1a1c..15866f43affa0 100644
--- a/clang-tools-extra/clangd/unittests/RenameTests.cpp
+++ b/clang-tools-extra/clangd/unittests/RenameTests.cpp
@@ -214,7 +214,7 @@ TEST(RenameTest, WithinFileRename) {
         template<typename T>
         class Foo {
         public:
-          static T [[f^oo]]() {}
+          static T [[f^oo]]() { return T(); }
         };
 
         void bar() {
@@ -225,7 +225,7 @@ TEST(RenameTest, WithinFileRename) {
         template<typename T>
         class Foo {
         public:
-          T [[f^oo]]() {}
+          T [[f^oo]]() { return T(); }
         };
 
         void bar() {
@@ -827,7 +827,7 @@ TEST(RenameTest, WithinFileRename) {
 
       // Issue 170: Rename symbol introduced by UsingDecl
       R"cpp(
-        namespace ns { void [[f^oo]](); } 
+        namespace ns { void [[f^oo]](); }
 
         using ns::[[f^oo]];
 
@@ -1307,7 +1307,7 @@ TEST(RenameTest, Renameable) {
        "no symbol", false},
 
       {R"cpp(// FIXME we probably want to rename both overloads here,
-             // but renaming currently assumes there's only a 
+             // but renaming currently assumes there's only a
              // single canonical declaration.
         namespace ns { int foo(int); char foo(char); }
         using ns::^foo;
@@ -1776,7 +1776,7 @@ TEST(CrossFileRenameTests, WithUpToDateIndex) {
           void [[foo]]() override {};
         };
 
-        void func(Base* b, Derived1* d1, 
+        void func(Base* b, Derived1* d1,
                   Derived2* d2, NotDerived* nd) {
           b->[[foo]]();
           d1->[[foo]]();
diff --git a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
index 1ec51d862d0a6..94cecce1f038c 100644
--- a/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SemanticHighlightingTests.cpp
@@ -741,6 +741,7 @@ sizeof...($TemplateParameter[[Elements]]);
           $Class[[Foo]].$Field_static[[sharedInstance]].$Field[[someProperty]] $Operator[[=]] 1;
           self.$Field[[someProperty]] $Operator[[=]] self.$Field[[someProperty]] $Operator[[+]] self.$Field[[otherMethod]] $Operator[[+]] 1;
           self->$Field[[_someProperty]] $Operator[[=]] $Field[[_someProperty]] $Operator[[+]] 1;
+          return 0;
         }
         @end
       )cpp",
diff --git a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp
index 7faef6f95d8f9..7ede19c321bc6 100644
--- a/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SemanticSelectionTests.cpp
@@ -201,6 +201,7 @@ TEST(FoldingRanges, ASTAll) {
       R"cpp(
         #define FOO int foo() {\
           int Variable = 42; \
+          return 0; \
         }
 
         // Do not generate folding range for braces within macro expansion.
@@ -336,18 +337,18 @@ TEST(FoldingRanges, PseudoParserWithoutLineFoldings) {
         ]]};
       )cpp",
       R"cpp(
-        /*[[ Multi 
+        /*[[ Multi
           * line
-          *  comment 
+          *  comment
           ]]*/
       )cpp",
       R"cpp(
         //[[ Comment
         // 1]]
-        
+
         //[[ Comment
         // 2]]
-        
+
         // No folding for single line comment.
 
         /*[[ comment 3
diff --git a/clang-tools-extra/clangd/unittests/SymbolInfoTests.cpp b/clang-tools-extra/clangd/unittests/SymbolInfoTests.cpp
index 6c91f3783a622..95b6eaedce97c 100644
--- a/clang-tools-extra/clangd/unittests/SymbolInfoTests.cpp
+++ b/clang-tools-extra/clangd/unittests/SymbolInfoTests.cpp
@@ -36,6 +36,7 @@ TEST(SymbolInfoTests, All) {
           void $decl[[foo]]();
           int bar() {
             fo^o();
+            return 0;
           }
         )cpp",
               {ExpectedSymbolDetails{"foo", "", "c:@F@foo#", "decl"}}},
@@ -44,6 +45,7 @@ TEST(SymbolInfoTests, All) {
           void $def[[foo]]() {}
           int bar() {
             fo^o();
+            return 0;
           }
         )cpp",
               {ExpectedSymbolDetails{"foo", "", "c:@F@foo#", "def", "def"}}},
@@ -53,6 +55,7 @@ TEST(SymbolInfoTests, All) {
           void $def[[foo]]() {}
           int bar() {
             fo^o();
+            return 0;
           }
         )cpp",
               {ExpectedSymbolDetails{"foo", "", "c:@F@foo#", "decl", "def"}}},
@@ -83,6 +86,7 @@ TEST(SymbolInfoTests, All) {
             void $decl[[foo]]();
             int baz() {
               fo^o();
+              return 0;
             }
           }
         )cpp",
@@ -96,6 +100,7 @@ TEST(SymbolInfoTests, All) {
           namespace barbar {
             int baz() {
               bar::fo^o();
+              return 0;
             }
           }
         )cpp",
@@ -108,6 +113,7 @@ TEST(SymbolInfoTests, All) {
             namespace Nbaz {
               int baz() {
                 ::fo^o();
+              return 0;
               }
             }
           }
@@ -121,6 +127,7 @@ TEST(SymbolInfoTests, All) {
           namespace barbar {
             int baz() {
               fo^o();
+              return 0;
             }
           }
         )cpp",
@@ -136,6 +143,7 @@ TEST(SymbolInfoTests, All) {
             int baz() {
               bar::BarType b;
               fo^o(b);
+              return 0;
             }
           }
         )cpp",
diff --git a/clang-tools-extra/clangd/unittests/XRefsTests.cpp b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
index 475b56b1dc230..e12d7691c58fb 100644
--- a/clang-tools-extra/clangd/unittests/XRefsTests.cpp
+++ b/clang-tools-extra/clangd/unittests/XRefsTests.cpp
@@ -95,7 +95,7 @@ TEST(HighlightsTest, All) {
       )cpp",
 
       R"cpp(// Function
-        int [[^foo]](int) {}
+        int [[^foo]](int) { return 0; }
         int main() {
           [[foo]]([[foo]](42));
           auto *X = &[[foo]];
@@ -2140,7 +2140,7 @@ TEST(FindReferences, WithinAST) {
       )cpp",
 
       R"cpp(// Function
-        int $def[[foo]](int) {}
+        int $def[[foo]](int) { return 0; }
         int main() {
           auto *X = &$(main)[[^foo]];
           $(main)[[foo]](42);
@@ -2160,7 +2160,7 @@ TEST(FindReferences, WithinAST) {
 
       R"cpp(// Method call
         struct Foo { int $decl(Foo)[[foo]](); };
-        int Foo::$def(Foo)[[foo]]() {}
+        int Foo::$def(Foo)[[foo]]() { return 0; }
         int main() {
           Foo f;
           f.$(main)[[^foo]]();
@@ -2258,7 +2258,7 @@ TEST(FindReferences, WithinAST) {
       )cpp",
       R"cpp(// Dependent code
         template <typename T> void $decl[[foo]](T t);
-        template <typename T> void bar(T t) { $(bar)[[foo]](t); } // foo in bar is uninstantiated. 
+        template <typename T> void bar(T t) { $(bar)[[foo]](t); } // foo in bar is uninstantiated.
         void baz(int x) { $(baz)[[f^oo]](x); }
       )cpp",
       R"cpp(
@@ -2508,6 +2508,7 @@ TEST(FindReferences, ExplicitSymbols) {
         X $def(test)[[a]];
         $(test)[[a]].operator bool();
         if ($(test)[[a^]]) {} // ignore implicit conversion-operator AST node
+        return 0;
       }
     )cpp",
   };
@@ -2543,7 +2544,7 @@ TEST(FindReferences, UsedSymbolsFromInclude) {
       #define BAR 5
       int bar1();
       int bar2();
-      class Bar {};            
+      class Bar {};
     )cpp");
     TU.AdditionalFiles["system/vector"] = guard(R"cpp(
       namespace std {
@@ -2560,7 +2561,7 @@ TEST(FindReferences, UsedSymbolsFromInclude) {
     std::vector<Matcher<ReferencesResult::Reference>> ExpectedLocations;
     for (const auto &R : T.ranges())
       ExpectedLocations.push_back(AllOf(rangeIs(R), attrsAre(0u)));
-    for (const auto &P : T.points()) 
+    for (const auto &P : T.points())
       EXPECT_THAT(findReferences(AST, P, 0).References,
                   UnorderedElementsAreArray(ExpectedLocations))
           << "Failed for Refs at " << P << "\n"
@@ -2635,6 +2636,7 @@ TEST(FindReferences, NeedsIndexForMacro) {
   Annotations IndexedMain(R"cpp(
     int indexed_main() {
       int a = [[MACRO]](1);
+      return 0;
     }
   )cpp");
 
diff --git a/clang-tools-extra/clangd/unittests/tweaks/DefineInlineTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/DefineInlineTests.cpp
index 8d496b2a3ee73..5ec12396ae927 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/DefineInlineTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/DefineInlineTests.cpp
@@ -935,10 +935,11 @@ TEST_F(DefineInlineTest, AddInline) {
   // Check we put inline before cv-qualifiers.
   ExtraFiles["a.h"] = "const int foo();";
   apply(R"cpp(#include "a.h"
-              const int fo^o() {})cpp",
+              const int fo^o() { return 0; })cpp",
         &EditedFiles);
-  EXPECT_THAT(EditedFiles, testing::ElementsAre(FileWithContents(
-                               testPath("a.h"), "inline const int foo(){}")));
+  EXPECT_THAT(EditedFiles,
+              testing::ElementsAre(FileWithContents(
+                  testPath("a.h"), "inline const int foo(){ return 0; }")));
 
   // No double inline.
   ExtraFiles["a.h"] = "inline void foo();";
diff --git a/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp
index 3730ab4a87136..8da394d74b54d 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/ExpandDeducedTypeTests.cpp
@@ -69,8 +69,8 @@ TEST_F(ExpandDeducedTypeTest, Test) {
   EXPECT_THAT(apply(R"cpp(au^to s = &"foobar";)cpp"),
               StartsWith("fail: Could not expand type"));
 
-  EXPECT_EQ(apply("ns::Class * foo() { au^to c = foo(); }"),
-            "ns::Class * foo() { ns::Class * c = foo(); }");
+  EXPECT_EQ(apply("ns::Class * foo() { au^to c = foo(); return nullptr; }"),
+            "ns::Class * foo() { ns::Class * c = foo(); return nullptr; }");
   EXPECT_EQ(
       apply("void ns::Func() { au^to x = new ns::Class::Nested{}; }"),
       "void ns::Func() { ns::Class::Nested * x = new ns::Class::Nested{}; }");
diff --git a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
index 552e693c0363a..3c65a58d6c945 100644
--- a/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
+++ b/clang-tools-extra/clangd/unittests/tweaks/ExtractVariableTests.cpp
@@ -116,6 +116,7 @@ TEST_F(ExtractVariableTest, Test) {
       struct T {
         int bar(int a = [[1]]) {
           int b = [[z]];
+          return 0;
         }
         int z = [[1]];
       } t;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/absl/strings/internal-file.h b/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/absl/strings/internal-file.h
index 31798661a80fc..b9ce1c875ed13 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/absl/strings/internal-file.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/abseil/Inputs/absl/strings/internal-file.h
@@ -10,7 +10,7 @@ std::string StringsFunction(std::string s1) { return s1; }
 class SomeContainer {};
 namespace strings_internal {
 void InternalFunction() {}
-template <class P> P InternalTemplateFunction(P a) {}
+template <class P> void InternalTemplateFunction(P a) { int; }
 } // namespace strings_internal
 
 namespace container_internal {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/boost/use-to-string.cpp b/clang-tools-extra/test/clang-tidy/checkers/boost/use-to-string.cpp
index 44ba172c2ff0b..f888c430e6883 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/boost/use-to-string.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/boost/use-to-string.cpp
@@ -18,7 +18,7 @@ T lexical_cast(const V &) {
 
 struct my_weird_type {};
 
-std::string fun(const std::string &) {}
+std::string fun(const std::string &) { return {}; }
 
 void test_to_string1() {
 
@@ -75,7 +75,7 @@ void test_to_string2() {
   fun(boost::lexical_cast<std::string>(j));
 }
 
-std::string fun(const std::wstring &) {}
+std::string fun(const std::wstring &);
 
 void test_to_wstring() {
   int a;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp
index 222577b124dce..aff13d19fd209 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-coro.cpp
@@ -1,5 +1,5 @@
 // RUN: %check_clang_tidy -std=c++20 %s bugprone-exception-escape %t -- \
-// RUN:     -- -fexceptions
+// RUN:     -- -fexceptions -Wno-error=return-type
 
 namespace std {
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-rethrow.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-rethrow.cpp
index b20333d5b0b3b..6f961a247b9d2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-rethrow.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape-rethrow.cpp
@@ -20,6 +20,7 @@ int throwsAndCallsRethrower() noexcept {
     } catch(...) {
         rethrower();
     }
+    return 1;
 }
 
 int throwsAndCallsCallsRethrower() noexcept {
@@ -29,6 +30,7 @@ int throwsAndCallsCallsRethrower() noexcept {
     } catch(...) {
         callsRethrower();
     }
+    return 1;
 }
 
 void rethrowerNoexcept() noexcept {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
index 26c443b139629..aae957dd7e090 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/exception-escape.cpp
@@ -665,6 +665,7 @@ int indirectly_recursive(int n) noexcept;
 
 int recursion_helper(int n) {
   indirectly_recursive(n);
+  return 0;
 }
 
 int indirectly_recursive(int n) noexcept {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/fold-init-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/fold-init-type.cpp
index 2a49960e02895..c813213c3dd0f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/fold-init-type.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/fold-init-type.cpp
@@ -8,24 +8,25 @@ T accumulate(InputIt first, InputIt last, T init) {
   // is instantiated. In practice this happens somewhere in the implementation
   // of `accumulate`. For tests, do it here.
   (void)*first;
+  return init;
 }
 
 template <class InputIt, class T>
-T reduce(InputIt first, InputIt last, T init) { (void)*first; }
+T reduce(InputIt first, InputIt last, T init) { (void)*first; return init; }
 template <class ExecutionPolicy, class InputIt, class T>
 T reduce(ExecutionPolicy &&policy,
-         InputIt first, InputIt last, T init) { (void)*first; }
+         InputIt first, InputIt last, T init) { (void)*first; return init; }
 
 struct parallel_execution_policy {};
 constexpr parallel_execution_policy par{};
 
 template <class InputIt1, class InputIt2, class T>
 T inner_product(InputIt1 first1, InputIt1 last1,
-                InputIt2 first2, T value) { (void)*first1; (void)*first2; }
+                InputIt2 first2, T value) { (void)*first1; (void)*first2; return value;  }
 
 template <class ExecutionPolicy, class InputIt1, class InputIt2, class T>
 T inner_product(ExecutionPolicy &&policy, InputIt1 first1, InputIt1 last1,
-                InputIt2 first2, T value) { (void)*first1; (void)*first2; }
+                InputIt2 first2, T value) { (void)*first1; (void)*first2; return value; }
 
 } // namespace std
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/inc-dec-in-conditions-bitint-no-crash.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/inc-dec-in-conditions-bitint-no-crash.c
index cfb64c10fe46c..5cfa264e42d68 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/inc-dec-in-conditions-bitint-no-crash.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/inc-dec-in-conditions-bitint-no-crash.c
@@ -5,5 +5,6 @@ _BitInt(8) v_401_0() {
     _BitInt(5) y = 0;
     16777215wb ?: ++y;
   });
+  return 0;
 }
-// CHECK-MESSAGES: warning 
+// CHECK-MESSAGES: warning
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.c
index 8b84474d3f2d3..36b1215978603 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.c
@@ -12,9 +12,9 @@ typedef struct cnd_t {
 } cnd_t;
 struct timespec {};
 
-int cnd_wait(cnd_t *cond, mtx_t *mutex){};
+int cnd_wait(cnd_t *cond, mtx_t *mutex){ return 0; };
 int cnd_timedwait(cnd_t *cond, mtx_t *mutex,
-                  const struct timespec *time_point){};
+                  const struct timespec *time_point){ return 0; };
 
 struct Node1 list_c;
 static mtx_t lock;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.cpp
index 6db92ef939fa3..d7508009e19ad 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/spuriously-wake-up-functions.cpp
@@ -90,18 +90,18 @@ class condition_variable {
   void wait(unique_lock<mutex> &lock, Predicate pred);
   template <class Clock, class Duration>
   cv_status wait_until(unique_lock<mutex> &lock,
-                       const chrono::time_point<Clock, Duration> &abs_time){};
+                       const chrono::time_point<Clock, Duration> &abs_time){ return cv_status::no_timeout; };
   template <class Clock, class Duration, class Predicate>
   bool wait_until(unique_lock<mutex> &lock,
                   const chrono::time_point<Clock, Duration> &abs_time,
-                  Predicate pred){};
+                  Predicate pred){ return false; };
   template <class Rep, class Period>
   cv_status wait_for(unique_lock<mutex> &lock,
-                     const chrono::duration<Rep, Period> &rel_time){};
+                     const chrono::duration<Rep, Period> &rel_time){ return cv_status::no_timeout; };
   template <class Rep, class Period, class Predicate>
   bool wait_for(unique_lock<mutex> &lock,
                 const chrono::duration<Rep, Period> &rel_time,
-                Predicate pred){};
+                Predicate pred){ return false; };
 };
 
 } // namespace std
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp
index 02fcab31dcf3e..ff5b256e71781 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/stringview-nullptr.cpp
@@ -27,7 +27,7 @@ class basic_string_view {
 
   constexpr basic_string_view(const basic_string_view &) {}
 
-  constexpr basic_string_view &operator=(const basic_string_view &) {}
+  constexpr basic_string_view &operator=(const basic_string_view &) { return *this; }
 };
 
 template <typename CharT>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp
index 7e1dd6b444393..c14b094f3fca3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/suspicious-string-compare.cpp
@@ -89,6 +89,8 @@ int test_warning_patterns() {
   if (strcmp(A, "a") < 0.)
     return 0;
   // CHECK-MESSAGES: [[@LINE-2]]:7: warning: function 'strcmp' has suspicious implicit cast
+
+  return 1;
 }
 
 int test_valid_patterns() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/fuchsia/default-arguments-calls.cpp b/clang-tools-extra/test/clang-tidy/checkers/fuchsia/default-arguments-calls.cpp
index 50b6d4c5676c3..ed7bcc7dacc30 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/fuchsia/default-arguments-calls.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/fuchsia/default-arguments-calls.cpp
@@ -2,7 +2,7 @@
 
 int foo(int value = 5) { return value; }
 
-int f() {
+void f() {
   foo();
   // CHECK-NOTES: [[@LINE-1]]:3: warning: calling a function that uses a default argument is disallowed [fuchsia-default-arguments-calls]
   // CHECK-NOTES: [[@LINE-5]]:9: note: default parameter was declared here
@@ -10,7 +10,7 @@ int f() {
 
 int bar(int value) { return value; }
 
-int n() {
+void n() {
   foo(0);
   bar(0);
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/fuchsia/multiple-inheritance.cpp b/clang-tools-extra/test/clang-tidy/checkers/fuchsia/multiple-inheritance.cpp
index 6ce9ce8e65536..d53b3fde7736b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/fuchsia/multiple-inheritance.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/fuchsia/multiple-inheritance.cpp
@@ -144,7 +144,7 @@ struct WithTemplBase : T {
   WithTemplBase();
 };
 
-int test_no_crash() {
+void test_no_crash() {
   auto foo = []() {};
   WithTemplBase<decltype(foo)>();
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int-std.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int-std.cpp
index 30f9b3cf1e90c..cd65de51a5ce9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int-std.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int-std.cpp
@@ -54,4 +54,5 @@ short bar(const short, unsigned short) {
 
   tmpl<short>();
 // CHECK-MESSAGES: [[@LINE-1]]:8: warning: consider replacing 'short' with 'std::int16_t'
+  return 0;
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
index ce70e79183521..39ff9b7f39634 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/google/upgrade-googletest-case.cpp
@@ -221,9 +221,9 @@ class FooTestInfo : public testing::TestInfo {
   // CHECK-FIXES: const char *test_suite_name() const;
 };
 
-const char *FooTestInfo::test_case_name() const {}
+const char *FooTestInfo::test_case_name() const { return nullptr; }
 // CHECK-MESSAGES: [[@LINE-1]]:26: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: const char *FooTestInfo::test_suite_name() const {}
+// CHECK-FIXES: const char *FooTestInfo::test_suite_name() const { return nullptr; }
 
 class BarTestInfo : public testing::TestInfo {
 public:
@@ -491,26 +491,26 @@ class FooUnitTest : public testing::UnitTest {
   // CHECK-FIXES: const testing::TestSuite *GetTestSuite(int) const;
 };
 
-testing::TestCase *FooUnitTest::current_test_case() const {}
+testing::TestCase *FooUnitTest::current_test_case() const { return nullptr; }
 // CHECK-MESSAGES: [[@LINE-1]]:10: warning: Google Test APIs named with 'case'
 // CHECK-MESSAGES: [[@LINE-2]]:33: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: testing::TestSuite *FooUnitTest::current_test_suite() const {}
-int FooUnitTest::successful_test_case_count() const {}
+// CHECK-FIXES: testing::TestSuite *FooUnitTest::current_test_suite() const { return nullptr; }
+int FooUnitTest::successful_test_case_count() const { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:18: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: int FooUnitTest::successful_test_suite_count() const {}
-int FooUnitTest::failed_test_case_count() const {}
+// CHECK-FIXES: int FooUnitTest::successful_test_suite_count() const { return 0; }
+int FooUnitTest::failed_test_case_count() const { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:18: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: int FooUnitTest::failed_test_suite_count() const {}
-int FooUnitTest::total_test_case_count() const {}
+// CHECK-FIXES: int FooUnitTest::failed_test_suite_count() const { return 0; }
+int FooUnitTest::total_test_case_count() const { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:18: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: int FooUnitTest::total_test_suite_count() const {}
-int FooUnitTest::test_case_to_run_count() const {}
+// CHECK-FIXES: int FooUnitTest::total_test_suite_count() const { return 0; }
+int FooUnitTest::test_case_to_run_count() const { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:18: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: int FooUnitTest::test_suite_to_run_count() const {}
-const testing::TestCase *FooUnitTest::GetTestCase(int) const {}
+// CHECK-FIXES: int FooUnitTest::test_suite_to_run_count() const { return 0; }
+const testing::TestCase *FooUnitTest::GetTestCase(int) const { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:16: warning: Google Test APIs named with 'case'
 // CHECK-MESSAGES: [[@LINE-2]]:39: warning: Google Test APIs named with 'case'
-// CHECK-FIXES: const testing::TestSuite *FooUnitTest::GetTestSuite(int) const {}
+// CHECK-FIXES: const testing::TestSuite *FooUnitTest::GetTestSuite(int) const { return 0; }
 
 // Type derived from testing::TestCase
 class BarUnitTest : public testing::UnitTest {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-transform-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-transform-values.cpp
index 9a4eb010609b4..109eddc195558 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-transform-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-transform-values.cpp
@@ -54,8 +54,8 @@ void template_instantiation() {
 struct ConstNonConstClass {
   ConstNonConstClass();
   ConstNonConstClass(double &np_local0);
-  double nonConstMethod() {}
-  double constMethod() const {}
+  double nonConstMethod() { return 0; }
+  double constMethod() const { return 0; }
   double modifyingMethod(double &np_arg0) const;
 
   double NonConstMember;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
index 0d1ff0db58371..5efb64bca2374 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-values.cpp
@@ -283,8 +283,8 @@ void template_instantiation() {
 struct ConstNonConstClass {
   ConstNonConstClass();
   ConstNonConstClass(double &np_local0);
-  double nonConstMethod() {}
-  double constMethod() const {}
+  double nonConstMethod() { return 0; }
+  double constMethod() const { return 0; }
   double modifyingMethod(double &np_arg0) const;
 
   double NonConstMember;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp
index 524de45463e36..9b3dd070405b5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/unused-parameters.cpp
@@ -33,9 +33,9 @@ void f(void (*fn)()) {;}
 // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: parameter 'fn' is unused [misc-unused-parameters]
 // CHECK-FIXES: {{^}}void f(void (* /*fn*/)()) {;}{{$}}
 
-int *k([[clang::lifetimebound]] int *i) {;}
+int *k([[clang::lifetimebound]] int *i) { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:38: warning: parameter 'i' is unused [misc-unused-parameters]
-// CHECK-FIXES: {{^}}int *k({{\[\[clang::lifetimebound\]\]}} int * /*i*/) {;}{{$}}
+// CHECK-FIXES: {{^}}int *k({{\[\[clang::lifetimebound\]\]}} int * /*i*/) { return nullptr; }{{$}}
 
 #define ATTR_BEFORE(x) [[clang::lifetimebound]] x
 int* m(ATTR_BEFORE(const int *i)) { return nullptr; }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
index 68951fcf0aaac..abf95b857c192 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
@@ -17,25 +17,25 @@ void func_cpp_inc() {}
 // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc'
 // CHECK-FIXES: static void func_cpp_inc() {}
 
-int* func_cpp_inc_return_ptr() {}
+int* func_cpp_inc_return_ptr() { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc_return_ptr'
-// CHECK-FIXES: static int* func_cpp_inc_return_ptr() {}
+// CHECK-FIXES: static int* func_cpp_inc_return_ptr() { return nullptr; }
 
-const int* func_cpp_inc_return_const_ptr() {}
+const int* func_cpp_inc_return_const_ptr() { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_const_ptr'
-// CHECK-FIXES: static const int* func_cpp_inc_return_const_ptr() {}
+// CHECK-FIXES: static const int* func_cpp_inc_return_const_ptr() { return nullptr; }
 
-int const* func_cpp_inc_return_ptr_const() {}
+int const* func_cpp_inc_return_ptr_const() { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_ptr_const'
-// CHECK-FIXES: static int const* func_cpp_inc_return_ptr_const() {}
+// CHECK-FIXES: static int const* func_cpp_inc_return_ptr_const() { return nullptr; }
 
-int * const func_cpp_inc_return_const() {}
+int * const func_cpp_inc_return_const() { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'func_cpp_inc_return_const'
-// CHECK-FIXES: static int * const func_cpp_inc_return_const() {}
+// CHECK-FIXES: static int * const func_cpp_inc_return_const() { return nullptr; }
 
-volatile const int* func_cpp_inc_return_volatile_const_ptr() {}
+volatile const int* func_cpp_inc_return_volatile_const_ptr() { return nullptr; }
 // CHECK-MESSAGES: :[[@LINE-1]]:21: warning: function 'func_cpp_inc_return_volatile_const_ptr'
-// CHECK-FIXES: static volatile const int* func_cpp_inc_return_volatile_const_ptr() {}
+// CHECK-FIXES: static volatile const int* func_cpp_inc_return_volatile_const_ptr() { return nullptr; }
 
 [[nodiscard]] void func_nodiscard() {}
 // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: function 'func_nodiscard'
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-auto/containers.h b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-auto/containers.h
index c99b7a4407d5c..2c90d762f5d8a 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-auto/containers.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/Inputs/use-auto/containers.h
@@ -214,8 +214,8 @@ class map : public bidirectional_iterable<iterator<pair<key, value>>> {
 public:
   map() {}
 
-  iterator<pair<key, value>> find(const key &) {}
-  const_iterator<iterator<pair<key, value>>> find(const key &) const {}
+  iterator<pair<key, value>> find(const key &);
+  const_iterator<iterator<pair<key, value>>> find(const key &) const;
 };
 
 template <typename key, typename value>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp
index 22b24d45fe63f..0d100ffa38b27 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-bind.cpp
@@ -46,7 +46,7 @@ struct D {
   operator bool() const { return true; }
 
   void MemberFunction(int x) {}
-  int MemberFunctionWithReturn(int x) {}
+  int MemberFunctionWithReturn(int x) { return 0; }
 
   static D *create();
 };
@@ -342,7 +342,7 @@ void testCapturedSubexpressions() {
 
 struct E {
   void MemberFunction(int x) {}
-  int MemberFunctionWithReturn(int x) {}
+  int MemberFunctionWithReturn(int x) { return 0; }
   int operator()(int x, int y) const { return x + y; }
 
   void testMemberFunctions() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-c++20.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-c++20.cpp
index 1eb8ebe3d51e3..c9391e3339623 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-c++20.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-c++20.cpp
@@ -1,11 +1,11 @@
 // RUN: %check_clang_tidy -std=c++20 %s modernize-avoid-c-arrays %t
 
-int f1(int data[], int size) {
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: do not declare C-style arrays, use 'std::span' instead
+void f1(int data[], int size) {
+  // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: do not declare C-style arrays, use 'std::span' instead
   int f4[] = {1, 2};
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not declare C-style arrays, use 'std::array' instead
 }
 
-int f2(int data[100]) {
-  // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: do not declare C-style arrays, use 'std::array' instead
+void f2(int data[100]) {
+  // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: do not declare C-style arrays, use 'std::array' instead
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-main.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-main.cpp
index a0c79bb55a686..8e1890c234223 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-main.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-main.cpp
@@ -1,9 +1,13 @@
 // RUN: %check_clang_tidy -std=c++17 %s modernize-avoid-c-arrays %t
 
-int not_main(int argc, char *argv[]) {
-  // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
+namespace X {
+// Not main
+int main(int argc, char *argv[]) {
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
   int f4[] = {1, 2};
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not declare C-style arrays, use 'std::array' instead
+  return 0;
+}
 }
 
 int main(int argc, char *argv[]) {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-three-arg-main.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-three-arg-main.cpp
index bd39f0fb4f1c8..58eced408733a 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-three-arg-main.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/avoid-c-arrays-ignores-three-arg-main.cpp
@@ -1,19 +1,23 @@
 // RUN: %check_clang_tidy -std=c++17 %s modernize-avoid-c-arrays %t
 
-int not_main(int argc, char *argv[], char *argw[]) {
-  // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
-  // CHECK-MESSAGES: :[[@LINE-2]]:38: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
+namespace X {
+// Not main.
+int main(int argc, char *argv[], char *argw[]) {
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
+  // CHECK-MESSAGES: :[[@LINE-2]]:34: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
   int f4[] = {1, 2};
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not declare C-style arrays, use 'std::array' instead
+  return 0;
+}
 }
 
 int main(int argc, char *argv[], char *argw[]) {
   int f5[] = {1, 2};
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: do not declare C-style arrays, use 'std::array' instead
 
-  auto not_main = [](int argc, char *argv[], char *argw[]) {
-    // CHECK-MESSAGES: :[[@LINE-1]]:32: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
-    // CHECK-MESSAGES: :[[@LINE-2]]:46: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
+  auto main = [](int argc, char *argv[], char *argw[]) {
+    // CHECK-MESSAGES: :[[@LINE-1]]:28: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
+    // CHECK-MESSAGES: :[[@LINE-2]]:42: warning: do not declare C-style arrays, use 'std::array' or 'std::vector' instead
     int f6[] = {1, 2};
     // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: do not declare C-style arrays, use 'std::array' instead
   };
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp
index df2a2c1af1f54..8d1d7378e5cff 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp
@@ -170,6 +170,8 @@ const int *constArray() {
   // CHECK-FIXES: for (const int & I : ConstArr)
   // CHECK-FIXES-NEXT: if (Something)
   // CHECK-FIXES-NEXT: return &I;
+
+  return nullptr;
 }
 
 struct HasArr {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
index 3f4a14cd9bb64..e6562cd18dbab 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-emplace.cpp
@@ -49,7 +49,7 @@ class vector {
   template <typename... Args>
   void emplace_back(Args &&... args){};
   template <typename... Args>
-  iterator emplace(const_iterator pos, Args &&...args){};
+  iterator emplace(const_iterator pos, Args &&...args);
   ~vector();
 };
 
@@ -69,7 +69,7 @@ class list {
   void push_back(T &&) {}
 
   template <typename... Args>
-  iterator emplace(const_iterator pos, Args &&...args){};
+  iterator emplace(const_iterator pos, Args &&...args);
   template <typename... Args>
   void emplace_back(Args &&... args){};
   template <typename... Args>
@@ -93,7 +93,7 @@ class deque {
   void push_front(T &&) {}
 
   template <typename... Args>
-  iterator emplace(const_iterator pos, Args &&...args){};
+  iterator emplace(const_iterator pos, Args &&...args);
   template <typename... Args>
   void emplace_back(Args &&... args){};
   template <typename... Args>
@@ -116,7 +116,7 @@ class forward_list {
   template <typename... Args>
   void emplace_front(Args &&...args){};
   template <typename... Args>
-  iterator emplace_after(const_iterator pos, Args &&...args){};
+  iterator emplace_after(const_iterator pos, Args &&...args);
 };
 
 template <typename T>
@@ -131,7 +131,7 @@ class set {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename Key, typename T>
@@ -146,7 +146,7 @@ class map {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename T>
@@ -161,7 +161,7 @@ class multiset {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename Key, typename T>
@@ -176,7 +176,7 @@ class multimap {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename T>
@@ -191,7 +191,7 @@ class unordered_set {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename Key, typename T>
@@ -206,7 +206,7 @@ class unordered_map {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename T>
@@ -221,7 +221,7 @@ class unordered_multiset {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename Key, typename T>
@@ -236,7 +236,7 @@ class unordered_multimap {
   template <typename... Args>
   void emplace(Args &&...args){};
   template <typename... Args>
-  iterator emplace_hint(const_iterator pos, Args &&...args){};
+  iterator emplace_hint(const_iterator pos, Args &&...args);
 };
 
 template <typename T>
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
index 4abb9c8555970..7f737148a7cd1 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-equals-default-copy.cpp
@@ -1,6 +1,6 @@
 // RUN: %check_clang_tidy %s modernize-use-equals-default %t -- \
 // RUN:   -config="{CheckOptions: {modernize-use-equals-default.IgnoreMacros: false}}" \
-// RUN:   -- -fno-delayed-template-parsing -fexceptions
+// RUN:   -- -fno-delayed-template-parsing -fexceptions -Wno-error=return-type
 
 // Out of line definition.
 struct OL {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp
index 89d1aa48c46a3..bad8b7a8d7f08 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-override.cpp
@@ -203,13 +203,13 @@ struct InlineDefinitions : public Base {
   // CHECK-MESSAGES: :[[@LINE-2]]:16: warning: prefer using
   // CHECK-FIXES: {{^}}  void j() const override
 
-  virtual MustUseResultObject k() {}  // Has an implicit attribute.
+  virtual MustUseResultObject k();  // Has an implicit attribute.
   // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: prefer using
-  // CHECK-FIXES: {{^}}  MustUseResultObject k() override {}
+  // CHECK-FIXES: {{^}}  MustUseResultObject k() override;
 
-  virtual bool l() MUST_USE_RESULT UNUSED {}
+  virtual bool l() MUST_USE_RESULT UNUSED;
   // CHECK-MESSAGES: :[[@LINE-1]]:16: warning: prefer using
-  // CHECK-FIXES: {{^}}  bool l() override MUST_USE_RESULT UNUSED {}
+  // CHECK-FIXES: {{^}}  bool l() override MUST_USE_RESULT UNUSED;
 
   virtual void r() &
   {}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp
index 0a5a63eba2596..2af2e8949a814 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-std-format.cpp
@@ -100,7 +100,7 @@ std::string StrFormat_field_width_and_precision() {
   return s1 + s2 + s3 + s4 + s5 + s6;
 }
 
-std::string StrFormat_macros() {
+void StrFormat_macros() {
   // The function call is replaced even though it comes from a macro.
 #define FORMAT absl::StrFormat
   auto s1 = FORMAT("Hello %d", 42);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-trailing-return-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-trailing-return-type.cpp
index d9efc006b22ef..e1f36c52a7c01 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-trailing-return-type.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-trailing-return-type.cpp
@@ -106,9 +106,9 @@ extern "C" int d2(int arg);
 inline int d3(int arg) noexcept(true);
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}inline auto d3(int arg) noexcept(true) -> int;{{$}}
-inline int d4(int arg) try { } catch(...) { }
+inline int d4(int arg) try { return 0; } catch(...) { return 0; }
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
-// CHECK-FIXES: {{^}}inline auto d4(int arg) -> int try { } catch(...) { }{{$}}
+// CHECK-FIXES: {{^}}inline auto d4(int arg) -> int try { return 0; } catch(...) { return 0; }{{$}}
 int d5(int arg) throw();
 // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}auto d5(int arg) throw() -> int;{{$}}
@@ -167,9 +167,9 @@ namespace N {
 }
 // CHECK-MESSAGES: :[[@LINE-2]]:9: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}    auto e1() -> int;{{$}}
-int N::e1() {}
+int N::e1() { return 0; }
 // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
-// CHECK-FIXES: {{^}}auto N::e1() -> int {}{{$}}
+// CHECK-FIXES: {{^}}auto N::e1() -> int { return 0; }{{$}}
 
 //
 // Functions with unsupported return types
@@ -260,14 +260,14 @@ struct B {
     B& operator=(const B&);
 // CHECK-MESSAGES: :[[@LINE-1]]:8: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}    auto operator=(const B&) -> B&;{{$}}
-    
+
     double base1(int, bool b);
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}    auto base1(int, bool b) -> double;{{$}}
 
-    virtual double base2(int, bool b) {}
+    virtual double base2(int, bool b) { return 0; }
 // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
-// CHECK-FIXES: {{^}}    virtual auto base2(int, bool b) -> double {}{{$}}
+// CHECK-FIXES: {{^}}    virtual auto base2(int, bool b) -> double { return 0; }{{$}}
 
     virtual float base3() const = 0;
 // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
@@ -298,9 +298,9 @@ struct B {
 // CHECK-FIXES: {{^}}    virtual auto base9() const noexcept -> const char * { return ""; }{{$}}
 };
 
-double B::base1(int, bool b) {}
+double B::base1(int, bool b) { return 0; }
 // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
-// CHECK-FIXES: {{^}}auto B::base1(int, bool b) -> double {}{{$}}
+// CHECK-FIXES: {{^}}auto B::base1(int, bool b) -> double { return 0; }{{$}}
 
 struct D : B {
     virtual double f1(int, bool b) final;
@@ -311,9 +311,9 @@ struct D : B {
 // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
 // CHECK-FIXES: {{^}}    virtual auto base2(int, bool b) -> double override;{{$}}
 
-    virtual float base3() const override final { }
+    virtual float base3() const override final { return 0; }
 // CHECK-MESSAGES: :[[@LINE-1]]:19: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
-// CHECK-FIXES: {{^}}    virtual auto base3() const -> float override final { }{{$}}
+// CHECK-FIXES: {{^}}    virtual auto base3() const -> float override final { return 0; }{{$}}
 
     const char * base9() const noexcept override { return ""; }
 // CHECK-MESSAGES: :[[@LINE-1]]:18: warning: use a trailing return type for this function [modernize-use-trailing-return-type]
@@ -586,13 +586,13 @@ void c(int arg) { return; }
 struct D2 : B {
     D2();
     virtual ~D2();
-    
+
     virtual auto f1(int, bool b) -> double final;
     virtual auto base2(int, bool b) -> double override;
-    virtual auto base3() const -> float override final { }
+    virtual auto base3() const -> float override final { return 0;  }
 
     operator double();
 };
 
 auto l1 = [](int arg) {};
-auto l2 = [](int arg) -> double {};
+auto l2 = [](int arg) -> double { return 0; };
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header-fixed.h b/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header-fixed.h
index a40b2b2ece52e..1dcdd7a5ea4b4 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header-fixed.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header-fixed.h
@@ -12,4 +12,4 @@ int f1(int n, ABC v1); // line 11
 
 
 
-int f2(        int n,       const ABC& v2); // line 15
+void f2(        int n,       const ABC& v2); // line 15
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header.h b/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header.h
index 94916755ddafe..d6f6e65ace79d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/Inputs/unnecessary-value-param/header.h
@@ -12,4 +12,4 @@ int f1(int n, ABC v1); // line 11
 
 
 
-int f2(        int n,       ABC v2); // line 15
+void f2(        int n,       ABC v2); // line 15
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
index 1dbd56b322202..a1edf5fae2f9e 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/inefficient-string-concatenation.cpp
@@ -6,15 +6,15 @@ class basic_string {
 public:
   basic_string() {}
   ~basic_string() {}
-  basic_string<T> *operator+=(const basic_string<T> &) {}
-  friend basic_string<T> operator+(const basic_string<T> &, const basic_string<T> &) {}
+  basic_string<T> *operator+=(const basic_string<T> &);
+  friend basic_string<T> operator+(const basic_string<T> &, const basic_string<T> &);
 };
 typedef basic_string<char> string;
 typedef basic_string<wchar_t> wstring;
 }
 
 void f(std::string) {}
-std::string g(std::string) {}
+std::string g(std::string);
 
 int main() {
   std::string mystr1, mystr2;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-header.cpp b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-header.cpp
index 2b45bb719dbc5..8461248982447 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-header.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/performance/unnecessary-value-param-header.cpp
@@ -14,7 +14,7 @@ int f1(int n, ABC v1, ABC v2) {
   // CHECK-FIXES: int f1(int n, const ABC& v1, const ABC& v2) {
   return v1.get(n) + v2.get(n);
 }
-int f2(int n, ABC v2) {
-  // CHECK-MESSAGES: [[@LINE-1]]:19: warning: the parameter 'v2' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
-  // CHECK-FIXES: int f2(int n, const ABC& v2) {
+void f2(int n, ABC v2) {
+  // CHECK-MESSAGES: [[@LINE-1]]:20: warning: the parameter 'v2' is copied for each invocation but only used as a const reference; consider making it a const reference [performance-unnecessary-value-param]
+  // CHECK-FIXES: void f2(int n, const ABC& v2) {
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style1/header.h b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style1/header.h
index abbf7dfa48395..bbedc9b1df2dc 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style1/header.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style1/header.h
@@ -4,4 +4,4 @@ void style_first_good();
 
 void styleFirstBad();
 
-int thisIsMainLikeIgnored(int argc, const char *argv[]) {}
+int thisIsMainLikeIgnored(int argc, const char *argv[]) { return 0; }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style2/header.h b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style2/header.h
index 9d3e846a080b9..3b3b1e9508e8f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style2/header.h
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/Inputs/identifier-naming/global-style2/header.h
@@ -4,4 +4,4 @@ void STYLE_SECOND_GOOD();
 
 void styleSecondBad();
 
-int thisIsMainLikeNotIgnored(int argc, const char *argv[]) {}
+int thisIsMainLikeNotIgnored(int argc, const char *argv[]) { return 0; }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type-macros.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type-macros.cpp
index 5131011118f30..0a154c5d23d47 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type-macros.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type-macros.cpp
@@ -6,16 +6,16 @@
 
 // Regression tests involving macros
 #define CONCAT(a, b) a##b
-CONCAT(cons, t) int p22(){}
+CONCAT(cons, t) int p22(){ return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'const int' is 'const'-qu
 // We warn, but we can't give a fix
 
 #define CONSTINT const int
-CONSTINT p23() {}
+CONSTINT p23() { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'const int' is 'const'-qu
 
 #define CONST const
-CONST int p24() {}
+CONST int p24() { return 0; }
 // CHECK-MESSAGES: [[@LINE-1]]:1: warning: return type 'const int' is 'const'-qu
 
 #define CREATE_FUNCTION()                    \
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
index 76a3555663b18..d913ab4dee9ba 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/const-return-type.cpp
@@ -1,4 +1,4 @@
-// RUN: %check_clang_tidy -std=c++14-or-later %s readability-const-return-type %t
+// RUN: %check_clang_tidy -std=c++14-or-later %s readability-const-return-type %t -- -- -Wno-error=return-type
 
 //  p# = positive test
 //  n# = negative test
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/convert-member-functions-to-static.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/convert-member-functions-to-static.cpp
index 5ec1f221b2207..a6b95bdb57e4d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/convert-member-functions-to-static.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/convert-member-functions-to-static.cpp
@@ -32,6 +32,7 @@ class A {
     // CHECK-MESSAGES: :[[@LINE-1]]:7: warning: method 'call_static_member' can be made static
     // CHECK-FIXES: {{^}}  static int call_static_member() {
     already_static();
+    return 0;
   }
 
   int read_static() {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
index be5ba54513c67..1771836539d86 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/identifier-naming.cpp
@@ -547,6 +547,7 @@ struct_type GlobalTypedefTestFunction(struct_type a_argument1) {
 // CHECK-FIXES: {{^}}struct_type_t GlobalTypedefTestFunction(struct_type_t a_argument1) {
     struct_type typedef_test_1;
 // CHECK-FIXES: {{^}}    struct_type_t typedef_test_1;
+  return {};
 }
 
 using my_struct_type = THIS___Structure;
@@ -777,8 +778,8 @@ STATIC_MACRO void someFunc(ValueType a_v1, const ValueType& a_v2) {}
 // CHECK-FIXES: {{^}}STATIC_MACRO void someFunc(value_type_t a_v1, const value_type_t& a_v2) {}
 STATIC_MACRO void someFunc(const ValueType** p_a_v1, ValueType (*p_a_v2)()) {}
 // CHECK-FIXES: {{^}}STATIC_MACRO void someFunc(const value_type_t** p_a_v1, value_type_t (*p_a_v2)()) {}
-STATIC_MACRO ValueType someFunc() {}
-// CHECK-FIXES: {{^}}STATIC_MACRO value_type_t someFunc() {}
+STATIC_MACRO ValueType someFunc() { return {}; }
+// CHECK-FIXES: {{^}}STATIC_MACRO value_type_t someFunc() { return {}; }
 STATIC_MACRO void someFunc(MyFunPtr, const MyFunPtr****) {}
 // CHECK-FIXES: {{^}}STATIC_MACRO void someFunc(my_fun_ptr_t, const my_fun_ptr_t****) {}
 #undef STATIC_MACRO
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp
index c4b7a77b92f0a..75f666e3e07e5 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/implicit-bool-conversion.cpp
@@ -465,7 +465,7 @@ struct S {
   // CHECK-FIXES: S(bool a, bool b, bool c) : a(static_cast<int>(a)), b(b), c(static_cast<int>(c)) {}
 };
 
-bool f(S& s) {
+void f(S& s) {
   functionTaking<bool>(s.a);
   // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: implicit conversion 'int' -> 'bool'
   // CHECK-FIXES: functionTaking<bool>(s.a != 0);
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp
index 8c6fb123ac023..50433d5d12ea9 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/named-parameter.cpp
@@ -37,8 +37,8 @@ void operator delete[](void *x) throw();
 void operator delete[](void * /*x*/) throw();
 
 struct X {
-  X operator++(int) {}
-  X operator--(int) {}
+  void operator++(int) {}
+  void operator--(int) {}
 
   X(X&) = delete;
   X &operator=(X&) = default;
@@ -86,22 +86,23 @@ void FDef2(int n, int) {}
 void FNoDef(int);
 
 class Z {};
+Z the_z;
 
-Z &operator++(Z&) {}
+Z &operator++(Z&) { return the_z; }
 // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function
-// CHECK-FIXES: Z &operator++(Z& /*unused*/) {}
+// CHECK-FIXES: Z &operator++(Z& /*unused*/) { return the_z; }
 
-Z &operator++(Z&, int) {}
+Z &operator++(Z&, int) { return the_z; }
 // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function
-// CHECK-FIXES: Z &operator++(Z& /*unused*/, int) {}
+// CHECK-FIXES: Z &operator++(Z& /*unused*/, int) { return the_z; }
 
-Z &operator--(Z&) {}
+Z &operator--(Z&) { return the_z; }
 // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function
-// CHECK-FIXES: Z &operator--(Z& /*unused*/) {}
+// CHECK-FIXES: Z &operator--(Z& /*unused*/) { return the_z; }
 
-Z &operator--(Z&, int) {}
+Z &operator--(Z&, int) { return the_z; }
 // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: all parameters should be named in a function
-// CHECK-FIXES: Z &operator--(Z& /*unused*/, int) {}
+// CHECK-FIXES: Z &operator--(Z& /*unused*/, int) { return the_z; }
 
 namespace testing {
 namespace internal {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.c b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.c
index c2e8bf68b4ad7..dbcc4cf6d1022 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.c
@@ -20,7 +20,7 @@ static int f(void);
 static int f(void); // f
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: redundant 'f' declaration
 // CHECK-FIXES: {{^}}// f{{$}}
-static int f(void) {}
+static int f(void) { return 0; }
 
 inline void g(void) {}
 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.cpp
index be505f55b86b0..595eccf8854ba 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-declaration.cpp
@@ -38,7 +38,7 @@ static int f();
 static int f(); // f
 // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: redundant 'f' declaration
 // CHECK-FIXES: {{^}}// f{{$}}
-static int f() {}
+static int f() { return 0; }
 
 // Original check crashed for the code below.
 namespace std {
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
index 202fe9be6d00c..a0d51dec7f32d 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/static-accessed-through-instance.cpp
@@ -264,7 +264,7 @@ struct Qptr {
   }
 };
 
-int func(Qptr qp) {
+void func(Qptr qp) {
   qp->y = 10;
   qp->K = 10;
   // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: static member accessed through instance [readability-static-accessed-through-instance]
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/suspicious-call-argument.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/suspicious-call-argument.cpp
index edd3591517af3..27db92be21f20 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/readability/suspicious-call-argument.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/readability/suspicious-call-argument.cpp
@@ -382,7 +382,7 @@ enum opcode { Foo,
               Bar };
 static value *SimplifyRightShift(
     opcode Opcode, value *Op0, value *Op1, bool isExact,
-    const type1 &Q, unsigned MaxRecurse) {}
+    const type1 &Q, unsigned MaxRecurse) { return nullptr; }
 static value *SimplifyLShrInst(value *Op0, value *Op1, bool isExact,
                                const type1 &Q, unsigned MaxRecurse) {
   if (value *V = SimplifyRightShift(Foo, Op0, Op1, isExact, Q, MaxRecurse))
diff --git a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp
index f67c20635064a..ff216298cfd60 100644
--- a/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp
+++ b/clang-tools-extra/test/clang-tidy/infrastructure/duplicate-fixes-of-alias-checkers.cpp
@@ -31,7 +31,7 @@ class Foo {
   // CHECK-FIXES: _num2{};
 };
 
-int should_use_emplace(std::vector<Foo> &v) {
+void should_use_emplace(std::vector<Foo> &v) {
   v.push_back(Foo());
   // CHECK-FIXES: v.emplace_back();
   // CHECK-MESSAGES: warning: use emplace_back instead of push_back [hicpp-use-emplace,modernize-use-emplace]
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index db42fc5cc0da7..e1c61992512b5 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -144,6 +144,9 @@ Improvements to Clang's diagnostics
 - Fixed a bug where Clang's Analysis did not correctly model the destructor behavior of ``union`` members (#GH119415).
 - A statement attribute applied to a ``case`` label no longer suppresses
   'bypassing variable initialization' diagnostics (#84072).
+- The ``-Wunsafe-buffer-usage`` warning has been updated to warn
+  about unsafe libc function calls.  Those new warnings are emitted
+  under the subgroup ``-Wunsafe-buffer-usage-in-libc-call``.
 
 Improvements to Clang's time-trace
 ----------------------------------
diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h
index f305cbbce4c60..0f96bf0762ca4 100644
--- a/clang/include/clang/AST/Decl.h
+++ b/clang/include/clang/AST/Decl.h
@@ -5039,6 +5039,11 @@ class HLSLBufferDecl final : public NamedDecl, public DeclContext {
   SourceLocation KwLoc;
   /// IsCBuffer - Whether the buffer is a cbuffer (and not a tbuffer).
   bool IsCBuffer;
+  /// HasValidPackoffset - Whether the buffer has valid packoffset annotations
+  //                       on all declarations
+  bool HasValidPackoffset;
+  // LayoutStruct - Layout struct for the buffer
+  CXXRecordDecl *LayoutStruct;
 
   HLSLBufferDecl(DeclContext *DC, bool CBuffer, SourceLocation KwLoc,
                  IdentifierInfo *ID, SourceLocation IDLoc,
@@ -5059,6 +5064,10 @@ class HLSLBufferDecl final : public NamedDecl, public DeclContext {
   SourceLocation getRBraceLoc() const { return RBraceLoc; }
   void setRBraceLoc(SourceLocation L) { RBraceLoc = L; }
   bool isCBuffer() const { return IsCBuffer; }
+  void setHasValidPackoffset(bool PO) { HasValidPackoffset = PO; }
+  bool hasValidPackoffset() const { return HasValidPackoffset; }
+  const CXXRecordDecl *getLayoutStruct() const { return LayoutStruct; }
+  void addLayoutStruct(CXXRecordDecl *LS);
 
   // Implement isa/cast/dyncast/etc.
   static bool classof(const Decl *D) { return classofKind(D->getKind()); }
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index 1d9743520654e..c3ff7ebd88516 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -6266,8 +6266,8 @@ class HLSLAttributedResourceType : public Type, public llvm::FoldingSetNode {
     LLVM_PREFERRED_TYPE(bool)
     uint8_t RawBuffer : 1;
 
-    Attributes(llvm::dxil::ResourceClass ResourceClass, bool IsROV,
-               bool RawBuffer)
+    Attributes(llvm::dxil::ResourceClass ResourceClass, bool IsROV = false,
+               bool RawBuffer = false)
         : ResourceClass(ResourceClass), IsROV(IsROV), RawBuffer(RawBuffer) {}
 
     Attributes() : Attributes(llvm::dxil::ResourceClass::UAV, false, false) {}
diff --git a/clang/include/clang/Basic/TargetInfo.h b/clang/include/clang/Basic/TargetInfo.h
index 070cc792ca7db..db23afa6d6f0b 100644
--- a/clang/include/clang/Basic/TargetInfo.h
+++ b/clang/include/clang/Basic/TargetInfo.h
@@ -199,6 +199,10 @@ struct TransferrableTargetInfo {
   /// zero length bitfield, regardless of the zero length bitfield type.
   unsigned ZeroLengthBitfieldBoundary;
 
+  /// The largest container size which should be used for an over-sized
+  /// bitfield, in bits.
+  unsigned LargestOverSizedBitfieldContainer;
+
   /// If non-zero, specifies a maximum alignment to truncate alignment
   /// specified in the aligned attribute of a static variable to this value.
   unsigned MaxAlignedAttribute;
@@ -954,6 +958,10 @@ class TargetInfo : public TransferrableTargetInfo,
     return ZeroLengthBitfieldBoundary;
   }
 
+  unsigned getLargestOverSizedBitfieldContainer() const {
+    return LargestOverSizedBitfieldContainer;
+  }
+
   /// Get the maximum alignment in bits for a static variable with
   /// aligned attribute.
   unsigned getMaxAlignedAttribute() const { return MaxAlignedAttribute; }
diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp
index 610207cf8b9a4..5a3be1690f335 100644
--- a/clang/lib/AST/Decl.cpp
+++ b/clang/lib/AST/Decl.cpp
@@ -1747,6 +1747,10 @@ void NamedDecl::printNestedNameSpecifier(raw_ostream &OS,
       }
     }
 
+    // Suppress transparent contexts like export or HLSLBufferDecl context
+    if (Ctx->isTransparentContext())
+      continue;
+
     // Skip non-named contexts such as linkage specifications and ExportDecls.
     const NamedDecl *ND = dyn_cast<NamedDecl>(Ctx);
     if (!ND)
@@ -5717,7 +5721,7 @@ HLSLBufferDecl::HLSLBufferDecl(DeclContext *DC, bool CBuffer,
                                SourceLocation IDLoc, SourceLocation LBrace)
     : NamedDecl(Decl::Kind::HLSLBuffer, DC, IDLoc, DeclarationName(ID)),
       DeclContext(Decl::Kind::HLSLBuffer), LBraceLoc(LBrace), KwLoc(KwLoc),
-      IsCBuffer(CBuffer) {}
+      IsCBuffer(CBuffer), HasValidPackoffset(false), LayoutStruct(nullptr) {}
 
 HLSLBufferDecl *HLSLBufferDecl::Create(ASTContext &C,
                                        DeclContext *LexicalParent, bool CBuffer,
@@ -5747,6 +5751,12 @@ HLSLBufferDecl *HLSLBufferDecl::CreateDeserialized(ASTContext &C,
                                     SourceLocation(), SourceLocation());
 }
 
+void HLSLBufferDecl::addLayoutStruct(CXXRecordDecl *LS) {
+  assert(LayoutStruct == nullptr && "layout struct has already been set");
+  LayoutStruct = LS;
+  addDecl(LS);
+}
+
 //===----------------------------------------------------------------------===//
 // ImportDecl Implementation
 //===----------------------------------------------------------------------===//
diff --git a/clang/lib/AST/RecordLayoutBuilder.cpp b/clang/lib/AST/RecordLayoutBuilder.cpp
index 3e38ba0a43d98..b8600e6a344a4 100644
--- a/clang/lib/AST/RecordLayoutBuilder.cpp
+++ b/clang/lib/AST/RecordLayoutBuilder.cpp
@@ -1469,15 +1469,18 @@ void ItaniumRecordLayoutBuilder::LayoutWideBitField(uint64_t FieldSize,
   //   sizeof(T')*8 <= n.
 
   QualType IntegralPODTypes[] = {
-    Context.UnsignedCharTy, Context.UnsignedShortTy, Context.UnsignedIntTy,
-    Context.UnsignedLongTy, Context.UnsignedLongLongTy
+      Context.UnsignedCharTy,     Context.UnsignedShortTy,
+      Context.UnsignedIntTy,      Context.UnsignedLongTy,
+      Context.UnsignedLongLongTy, Context.UnsignedInt128Ty,
   };
 
   QualType Type;
+  uint64_t MaxSize =
+      Context.getTargetInfo().getLargestOverSizedBitfieldContainer();
   for (const QualType &QT : IntegralPODTypes) {
     uint64_t Size = Context.getTypeSize(QT);
 
-    if (Size > FieldSize)
+    if (Size > FieldSize || Size > MaxSize)
       break;
 
     Type = QT;
@@ -1520,6 +1523,7 @@ void ItaniumRecordLayoutBuilder::LayoutWideBitField(uint64_t FieldSize,
   setSize(std::max(getSizeInBits(), getDataSizeInBits()));
 
   // Remember max struct/class alignment.
+  UnadjustedAlignment = std::max(UnadjustedAlignment, TypeAlign);
   UpdateAlignment(TypeAlign);
 }
 
diff --git a/clang/lib/Basic/TargetInfo.cpp b/clang/lib/Basic/TargetInfo.cpp
index c0bf4e686cf03..0699ec686e4e6 100644
--- a/clang/lib/Basic/TargetInfo.cpp
+++ b/clang/lib/Basic/TargetInfo.cpp
@@ -141,6 +141,7 @@ TargetInfo::TargetInfo(const llvm::Triple &T) : Triple(T) {
   UseLeadingZeroLengthBitfield = true;
   UseExplicitBitFieldAlignment = true;
   ZeroLengthBitfieldBoundary = 0;
+  LargestOverSizedBitfieldContainer = 64;
   MaxAlignedAttribute = 0;
   HalfFormat = &llvm::APFloat::IEEEhalf();
   FloatFormat = &llvm::APFloat::IEEEsingle();
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index fad8d773bfc52..3633bab6e0df9 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -261,6 +261,10 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
   assert(UseBitFieldTypeAlignment && "bitfields affect type alignment");
   UseZeroLengthBitfieldAlignment = true;
 
+  // AAPCS64 allows any "fundamental integer data type" to be used for
+  // over-sized bitfields, which includes 128-bit integers.
+  LargestOverSizedBitfieldContainer = 128;
+
   HasUnalignedAccess = true;
 
   // AArch64 targets default to using the ARM C++ ABI.
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 856d8b1b2948d..547220fb1f1e1 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -17,16 +17,22 @@
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
 #include "TargetInfo.h"
+#include "clang/AST/ASTContext.h"
 #include "clang/AST/Decl.h"
 #include "clang/AST/RecursiveASTVisitor.h"
+#include "clang/AST/Type.h"
 #include "clang/Basic/TargetOptions.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
 #include "llvm/Support/Alignment.h"
 
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
 
 using namespace clang;
@@ -34,6 +40,9 @@ using namespace CodeGen;
 using namespace clang::hlsl;
 using namespace llvm;
 
+static void createResourceInitFn(CodeGenModule &CGM, llvm::GlobalVariable *GV,
+                                 unsigned Slot, unsigned Space);
+
 namespace {
 
 void addDxilValVersion(StringRef ValVersionStr, llvm::Module &M) {
@@ -56,58 +65,17 @@ void addDxilValVersion(StringRef ValVersionStr, llvm::Module &M) {
   auto *DXILValMD = M.getOrInsertNamedMetadata(DXILValKey);
   DXILValMD->addOperand(Val);
 }
-// cbuffer will be translated into global variable in special address space.
-// If translate into C,
-// cbuffer A {
-//   float a;
-//   float b;
-// }
-// float foo() { return a + b; }
-//
-// will be translated into
-//
-// struct A {
-//   float a;
-//   float b;
-// } cbuffer_A __attribute__((address_space(4)));
-// float foo() { return cbuffer_A.a + cbuffer_A.b; }
-//
-// layoutBuffer will create the struct A type.
-// replaceBuffer will replace use of global variable a and b with cbuffer_A.a
-// and cbuffer_A.b.
-//
-void layoutBuffer(CGHLSLRuntime::Buffer &Buf, const DataLayout &DL) {
-  if (Buf.Constants.empty())
-    return;
-
-  std::vector<llvm::Type *> EltTys;
-  for (auto &Const : Buf.Constants) {
-    GlobalVariable *GV = Const.first;
-    Const.second = EltTys.size();
-    llvm::Type *Ty = GV->getValueType();
-    EltTys.emplace_back(Ty);
-  }
-  Buf.LayoutStruct = llvm::StructType::get(EltTys[0]->getContext(), EltTys);
-}
-
-GlobalVariable *replaceBuffer(CGHLSLRuntime::Buffer &Buf) {
-  // Create global variable for CB.
-  GlobalVariable *CBGV = new GlobalVariable(
-      Buf.LayoutStruct, /*isConstant*/ true,
-      GlobalValue::LinkageTypes::ExternalLinkage, nullptr,
-      llvm::formatv("{0}{1}", Buf.Name, Buf.IsCBuffer ? ".cb." : ".tb."),
-      GlobalValue::NotThreadLocal);
-
-  return CBGV;
-}
 
 } // namespace
 
-llvm::Type *CGHLSLRuntime::convertHLSLSpecificType(const Type *T) {
+llvm::Type *
+CGHLSLRuntime::convertHLSLSpecificType(const Type *T,
+                                       SmallVector<unsigned> *Packoffsets) {
   assert(T->isHLSLSpecificType() && "Not an HLSL specific type!");
 
   // Check if the target has a specific translation for this type first.
-  if (llvm::Type *TargetTy = CGM.getTargetCodeGenInfo().getHLSLType(CGM, T))
+  if (llvm::Type *TargetTy =
+          CGM.getTargetCodeGenInfo().getHLSLType(CGM, T, Packoffsets))
     return TargetTy;
 
   llvm_unreachable("Generic handling of HLSL types is not supported.");
@@ -117,48 +85,174 @@ llvm::Triple::ArchType CGHLSLRuntime::getArch() {
   return CGM.getTarget().getTriple().getArch();
 }
 
-void CGHLSLRuntime::addConstant(VarDecl *D, Buffer &CB) {
-  if (D->getStorageClass() == SC_Static) {
-    // For static inside cbuffer, take as global static.
-    // Don't add to cbuffer.
-    CGM.EmitGlobal(D);
-    return;
-  }
+// Returns true if the type is an HLSL resource class
+static bool isResourceRecordType(const clang::Type *Ty) {
+  return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr;
+}
 
-  auto *GV = cast<GlobalVariable>(CGM.GetAddrOfGlobalVar(D));
-  GV->setExternallyInitialized(true);
-  // Add debug info for constVal.
-  if (CGDebugInfo *DI = CGM.getModuleDebugInfo())
-    if (CGM.getCodeGenOpts().getDebugInfo() >=
-        codegenoptions::DebugInfoKind::LimitedDebugInfo)
-      DI->EmitGlobalVariable(cast<GlobalVariable>(GV), D);
-
-  // FIXME: support packoffset.
-  // See https://github.com/llvm/llvm-project/issues/57914.
-  uint32_t Offset = 0;
-  bool HasUserOffset = false;
-
-  unsigned LowerBound = HasUserOffset ? Offset : UINT_MAX;
-  CB.Constants.emplace_back(std::make_pair(GV, LowerBound));
+// Returns true if the type is an HLSL resource class or an array of them
+static bool isResourceRecordTypeOrArrayOf(const clang::Type *Ty) {
+  while (const ConstantArrayType *CAT = dyn_cast<ConstantArrayType>(Ty))
+    Ty = CAT->getArrayElementTypeNoTypeQual();
+  return isResourceRecordType(Ty);
 }
 
-void CGHLSLRuntime::addBufferDecls(const DeclContext *DC, Buffer &CB) {
-  for (Decl *it : DC->decls()) {
-    if (auto *ConstDecl = dyn_cast<VarDecl>(it)) {
-      addConstant(ConstDecl, CB);
-    } else if (isa<CXXRecordDecl, EmptyDecl>(it)) {
+// Emits constant global variables for buffer constants declarations
+// and creates metadata linking the constant globals with the buffer global.
+void CGHLSLRuntime::emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl,
+                                                 llvm::GlobalVariable *BufGV) {
+  LLVMContext &Ctx = CGM.getLLVMContext();
+
+  // get the layout struct from constant buffer target type
+  llvm::Type *BufType = BufGV->getValueType();
+  llvm::Type *BufLayoutType =
+      cast<llvm::TargetExtType>(BufType)->getTypeParameter(0);
+  llvm::StructType *LayoutStruct = cast<llvm::StructType>(
+      cast<llvm::TargetExtType>(BufLayoutType)->getTypeParameter(0));
+
+  // Start metadata list associating the buffer global variable with its
+  // constatns
+  SmallVector<llvm::Metadata *> BufGlobals;
+  BufGlobals.push_back(ValueAsMetadata::get(BufGV));
+
+  const auto *ElemIt = LayoutStruct->element_begin();
+  for (Decl *D : BufDecl->decls()) {
+    if (isa<CXXRecordDecl, EmptyDecl>(D))
       // Nothing to do for this declaration.
-    } else if (isa<FunctionDecl>(it)) {
-      // A function within an cbuffer is effectively a top-level function,
-      // as it only refers to globally scoped declarations.
-      CGM.EmitTopLevelDecl(it);
+      continue;
+    if (isa<FunctionDecl>(D)) {
+      // A function within an cbuffer is effectively a top-level function.
+      CGM.EmitTopLevelDecl(D);
+      continue;
     }
+    VarDecl *VD = dyn_cast<VarDecl>(D);
+    if (!VD)
+      continue;
+
+    QualType VDTy = VD->getType();
+    if (VDTy.getAddressSpace() != LangAS::hlsl_constant) {
+      if (VD->getStorageClass() == SC_Static ||
+          VDTy.getAddressSpace() == LangAS::hlsl_groupshared ||
+          isResourceRecordTypeOrArrayOf(VDTy.getTypePtr())) {
+        // Emit static and groupshared variables and resource classes inside
+        // cbuffer as regular globals
+        CGM.EmitGlobal(VD);
+      } else {
+        // Anything else that is not in the hlsl_constant address space must be
+        // an empty struct or a zero-sized array and can be ignored
+        assert(BufDecl->getASTContext().getTypeSize(VDTy) == 0 &&
+               "constant buffer decl with non-zero sized type outside of "
+               "hlsl_constant address space");
+      }
+      continue;
+    }
+
+    assert(ElemIt != LayoutStruct->element_end() &&
+           "number of elements in layout struct does not match");
+    llvm::Type *LayoutType = *ElemIt++;
+
+    // FIXME: handle resources inside user defined structs
+    // (llvm/wg-hlsl#175)
+
+    // create global variable for the constant and to metadata list
+    GlobalVariable *ElemGV =
+        cast<GlobalVariable>(CGM.GetAddrOfGlobalVar(VD, LayoutType));
+    BufGlobals.push_back(ValueAsMetadata::get(ElemGV));
   }
+  assert(ElemIt == LayoutStruct->element_end() &&
+         "number of elements in layout struct does not match");
+
+  // add buffer metadata to the module
+  CGM.getModule()
+      .getOrInsertNamedMetadata("hlsl.cbs")
+      ->addOperand(MDNode::get(Ctx, BufGlobals));
 }
 
-void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *D) {
-  Buffers.emplace_back(Buffer(D));
-  addBufferDecls(D, Buffers.back());
+// Creates resource handle type for the HLSL buffer declaration
+static const clang::HLSLAttributedResourceType *
+createBufferHandleType(const HLSLBufferDecl *BufDecl) {
+  ASTContext &AST = BufDecl->getASTContext();
+  QualType QT = AST.getHLSLAttributedResourceType(
+      AST.HLSLResourceTy,
+      QualType(BufDecl->getLayoutStruct()->getTypeForDecl(), 0),
+      HLSLAttributedResourceType::Attributes(ResourceClass::CBuffer));
+  return cast<HLSLAttributedResourceType>(QT.getTypePtr());
+}
+
+static void fillPackoffsetLayout(const HLSLBufferDecl *BufDecl,
+                                 SmallVector<unsigned> &Layout) {
+  assert(Layout.empty() && "expected empty vector for layout");
+  assert(BufDecl->hasValidPackoffset());
+
+  for (Decl *D : BufDecl->decls()) {
+    if (isa<CXXRecordDecl, EmptyDecl>(D) || isa<FunctionDecl>(D)) {
+      continue;
+    }
+    VarDecl *VD = dyn_cast<VarDecl>(D);
+    if (!VD || VD->getType().getAddressSpace() != LangAS::hlsl_constant)
+      continue;
+    assert(VD->hasAttr<HLSLPackOffsetAttr>() &&
+           "expected packoffset attribute on every declaration");
+    size_t Offset = VD->getAttr<HLSLPackOffsetAttr>()->getOffsetInBytes();
+    Layout.push_back(Offset);
+  }
+}
+
+// Codegen for HLSLBufferDecl
+void CGHLSLRuntime::addBuffer(const HLSLBufferDecl *BufDecl) {
+
+  assert(BufDecl->isCBuffer() && "tbuffer codegen is not supported yet");
+
+  // create resource handle type for the buffer
+  const clang::HLSLAttributedResourceType *ResHandleTy =
+      createBufferHandleType(BufDecl);
+
+  // empty constant buffer is ignored
+  if (ResHandleTy->getContainedType()->getAsCXXRecordDecl()->isEmpty())
+    return;
+
+  // create global variable for the constant buffer
+  SmallVector<unsigned> Layout;
+  if (BufDecl->hasValidPackoffset())
+    fillPackoffsetLayout(BufDecl, Layout);
+
+  llvm::TargetExtType *TargetTy =
+      cast<llvm::TargetExtType>(convertHLSLSpecificType(
+          ResHandleTy, BufDecl->hasValidPackoffset() ? &Layout : nullptr));
+  llvm::GlobalVariable *BufGV =
+      new GlobalVariable(TargetTy, /*isConstant*/ true,
+                         GlobalValue::LinkageTypes::ExternalLinkage, nullptr,
+                         llvm::formatv("{0}{1}", BufDecl->getName(),
+                                       BufDecl->isCBuffer() ? ".cb" : ".tb"),
+                         GlobalValue::NotThreadLocal);
+  CGM.getModule().insertGlobalVariable(BufGV);
+
+  // Add globals for constant buffer elements and create metadata nodes
+  emitBufferGlobalsAndMetadata(BufDecl, BufGV);
+
+  // Resource initialization
+  const HLSLResourceBindingAttr *RBA =
+      BufDecl->getAttr<HLSLResourceBindingAttr>();
+  // FIXME: handle implicit binding if no binding attribute is found
+  // (llvm/llvm-project#110722)
+  if (RBA)
+    createResourceInitFn(CGM, BufGV, RBA->getSlotNumber(),
+                         RBA->getSpaceNumber());
+}
+
+llvm::TargetExtType *
+CGHLSLRuntime::getHLSLBufferLayoutType(const RecordType *StructType) {
+  const auto Entry = LayoutTypes.find(StructType);
+  if (Entry != LayoutTypes.end())
+    return Entry->getSecond();
+  return nullptr;
+}
+
+void CGHLSLRuntime::addHLSLBufferLayoutType(const RecordType *StructType,
+                                            llvm::TargetExtType *LayoutTy) {
+  assert(getHLSLBufferLayoutType(StructType) == nullptr &&
+         "layout type for this struct already exist");
+  LayoutTypes[StructType] = LayoutTy;
 }
 
 void CGHLSLRuntime::finishCodeGen() {
@@ -169,28 +263,8 @@ void CGHLSLRuntime::finishCodeGen() {
     addDxilValVersion(TargetOpts.DxilValidatorVersion, M);
 
   generateGlobalCtorDtorCalls();
-
-  const DataLayout &DL = M.getDataLayout();
-
-  for (auto &Buf : Buffers) {
-    layoutBuffer(Buf, DL);
-    GlobalVariable *GV = replaceBuffer(Buf);
-    M.insertGlobalVariable(GV);
-    llvm::hlsl::ResourceClass RC = Buf.IsCBuffer
-                                       ? llvm::hlsl::ResourceClass::CBuffer
-                                       : llvm::hlsl::ResourceClass::SRV;
-    llvm::hlsl::ResourceKind RK = Buf.IsCBuffer
-                                      ? llvm::hlsl::ResourceKind::CBuffer
-                                      : llvm::hlsl::ResourceKind::TBuffer;
-    addBufferResourceAnnotation(GV, RC, RK, /*IsROV=*/false,
-                                llvm::hlsl::ElementType::Invalid, Buf.Binding);
-  }
 }
 
-CGHLSLRuntime::Buffer::Buffer(const HLSLBufferDecl *D)
-    : Name(D->getName()), IsCBuffer(D->isCBuffer()),
-      Binding(D->getAttr<HLSLResourceBindingAttr>()) {}
-
 void CGHLSLRuntime::addBufferResourceAnnotation(llvm::GlobalVariable *GV,
                                                 llvm::hlsl::ResourceClass RC,
                                                 llvm::hlsl::ResourceKind RK,
@@ -524,21 +598,15 @@ void CGHLSLRuntime::generateGlobalCtorDtorCalls() {
   }
 }
 
-// Returns true if the type is an HLSL resource class
-static bool isResourceRecordType(const clang::Type *Ty) {
-  return HLSLAttributedResourceType::findHandleTypeOnResource(Ty) != nullptr;
-}
-
-static void createResourceInitFn(CodeGenModule &CGM, const VarDecl *VD,
-                                 llvm::GlobalVariable *GV, unsigned Slot,
-                                 unsigned Space) {
+static void createResourceInitFn(CodeGenModule &CGM, llvm::GlobalVariable *GV,
+                                 unsigned Slot, unsigned Space) {
   LLVMContext &Ctx = CGM.getLLVMContext();
   llvm::Type *Int1Ty = llvm::Type::getInt1Ty(Ctx);
 
   llvm::Function *InitResFunc = llvm::Function::Create(
       llvm::FunctionType::get(CGM.VoidTy, false),
       llvm::GlobalValue::InternalLinkage,
-      ("_init_resource_" + VD->getName()).str(), CGM.getModule());
+      ("_init_resource_" + GV->getName()).str(), CGM.getModule());
   InitResFunc->addFnAttr(llvm::Attribute::AlwaysInline);
 
   llvm::BasicBlock *EntryBB =
@@ -547,20 +615,15 @@ static void createResourceInitFn(CodeGenModule &CGM, const VarDecl *VD,
   const DataLayout &DL = CGM.getModule().getDataLayout();
   Builder.SetInsertPoint(EntryBB);
 
-  const HLSLAttributedResourceType *AttrResType =
-      HLSLAttributedResourceType::findHandleTypeOnResource(
-          VD->getType().getTypePtr());
-
-  // FIXME: Only simple declarations of resources are supported for now.
-  // Arrays of resources or resources in user defined classes are
-  // not implemented yet.
-  assert(AttrResType != nullptr &&
-         "Resource class must have a handle of HLSLAttributedResourceType");
-
-  llvm::Type *TargetTy =
-      CGM.getTargetCodeGenInfo().getHLSLType(CGM, AttrResType);
-  assert(TargetTy != nullptr &&
-         "Failed to convert resource handle to target type");
+  // Make sure the global variable is resource handle (cbuffer) or
+  // resource class (=class where the first element is a resource handle).
+  llvm::Type *HandleTy = GV->getValueType();
+  assert((HandleTy->isTargetExtTy() ||
+          (HandleTy->isStructTy() &&
+           HandleTy->getStructElementType(0)->isTargetExtTy())) &&
+         "unexpected type of the global");
+  if (!HandleTy->isTargetExtTy())
+    HandleTy = HandleTy->getStructElementType(0);
 
   llvm::Value *Args[] = {
       llvm::ConstantInt::get(CGM.IntTy, Space), /* reg_space */
@@ -572,9 +635,9 @@ static void createResourceInitFn(CodeGenModule &CGM, const VarDecl *VD,
       llvm::ConstantInt::get(Int1Ty, false) /* non-uniform */
   };
   llvm::Value *CreateHandle = Builder.CreateIntrinsic(
-      /*ReturnType=*/TargetTy,
+      /*ReturnType=*/HandleTy,
       CGM.getHLSLRuntime().getCreateHandleFromBindingIntrinsic(), Args, nullptr,
-      Twine(VD->getName()).concat("_h"));
+      Twine(GV->getName()).concat("_h"));
 
   llvm::Value *HandleRef = Builder.CreateStructGEP(GV->getValueType(), GV, 0);
   Builder.CreateAlignedStore(CreateHandle, HandleRef,
@@ -601,8 +664,7 @@ void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD,
     // not implemented yet.
     return;
 
-  createResourceInitFn(CGM, VD, GV, RBA->getSlotNumber(),
-                       RBA->getSpaceNumber());
+  createResourceInitFn(CGM, GV, RBA->getSlotNumber(), RBA->getSpaceNumber());
 }
 
 llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h
index 8767a2ddceb96..a9da42324a038 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.h
+++ b/clang/lib/CodeGen/CGHLSLRuntime.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGHLSLRUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGHLSLRUNTIME_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsDirectX.h"
@@ -46,20 +47,26 @@
     }                                                                          \
   }
 
+using ResourceClass = llvm::dxil::ResourceClass;
+
 namespace llvm {
 class GlobalVariable;
 class Function;
 class StructType;
+class Metadata;
 } // namespace llvm
 
 namespace clang {
+class NamedDecl;
 class VarDecl;
 class ParmVarDecl;
 class InitListExpr;
 class HLSLBufferDecl;
 class HLSLResourceBindingAttr;
 class Type;
+class RecordType;
 class DeclContext;
+class HLSLPackOffsetAttr;
 
 class FunctionDecl;
 
@@ -126,16 +133,6 @@ class CGHLSLRuntime {
     unsigned Space;
     BufferResBinding(HLSLResourceBindingAttr *Attr);
   };
-  struct Buffer {
-    Buffer(const HLSLBufferDecl *D);
-    llvm::StringRef Name;
-    // IsCBuffer - Whether the buffer is a cbuffer (and not a tbuffer).
-    bool IsCBuffer;
-    BufferResBinding Binding;
-    // Global variable and offset for each constant.
-    std::vector<std::pair<llvm::GlobalVariable *, unsigned>> Constants;
-    llvm::StructType *LayoutStruct = nullptr;
-  };
 
 protected:
   CodeGenModule &CGM;
@@ -147,7 +144,9 @@ class CGHLSLRuntime {
   CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {}
   virtual ~CGHLSLRuntime() {}
 
-  llvm::Type *convertHLSLSpecificType(const Type *T);
+  llvm::Type *
+  convertHLSLSpecificType(const Type *T,
+                          SmallVector<unsigned> *Packoffsets = nullptr);
 
   void annotateHLSLResource(const VarDecl *D, llvm::GlobalVariable *GV);
   void generateGlobalCtorDtorCalls();
@@ -163,6 +162,10 @@ class CGHLSLRuntime {
 
   llvm::Instruction *getConvergenceToken(llvm::BasicBlock &BB);
 
+  llvm::TargetExtType *
+  getHLSLBufferLayoutType(const RecordType *LayoutStructTy);
+  void addHLSLBufferLayoutType(const RecordType *LayoutStructTy,
+                               llvm::TargetExtType *LayoutTy);
   void emitInitListOpaqueValues(CodeGenFunction &CGF, InitListExpr *E);
 
 private:
@@ -171,10 +174,11 @@ class CGHLSLRuntime {
                                    llvm::hlsl::ResourceKind RK, bool IsROV,
                                    llvm::hlsl::ElementType ET,
                                    BufferResBinding &Binding);
-  void addConstant(VarDecl *D, Buffer &CB);
-  void addBufferDecls(const DeclContext *DC, Buffer &CB);
+  void emitBufferGlobalsAndMetadata(const HLSLBufferDecl *BufDecl,
+                                    llvm::GlobalVariable *BufGV);
   llvm::Triple::ArchType getArch();
-  llvm::SmallVector<Buffer> Buffers;
+
+  llvm::DenseMap<const clang::RecordType *, llvm::TargetExtType *> LayoutTypes;
 };
 
 } // namespace CodeGen
diff --git a/clang/lib/CodeGen/CMakeLists.txt b/clang/lib/CodeGen/CMakeLists.txt
index 868ec847b9634..05ab6671453f8 100644
--- a/clang/lib/CodeGen/CMakeLists.txt
+++ b/clang/lib/CodeGen/CMakeLists.txt
@@ -106,6 +106,7 @@ add_clang_library(clangCodeGen
   ConstantInitBuilder.cpp
   CoverageMappingGen.cpp
   ItaniumCXXABI.cpp
+  HLSLBufferLayoutBuilder.cpp
   LinkInModulesPass.cpp
   MacroPPCallbacks.cpp
   MicrosoftCXXABI.cpp
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
new file mode 100644
index 0000000000000..1ae00023ab2bc
--- /dev/null
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.cpp
@@ -0,0 +1,229 @@
+//===- HLSLBufferLayoutBuilder.cpp ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "HLSLBufferLayoutBuilder.h"
+#include "CGHLSLRuntime.h"
+#include "CodeGenModule.h"
+#include "clang/AST/Type.h"
+
+//===----------------------------------------------------------------------===//
+// Implementation of constant buffer layout common between DirectX and
+// SPIR/SPIR-V.
+//===----------------------------------------------------------------------===//
+
+using namespace clang;
+using namespace clang::CodeGen;
+
+namespace {
+
+// Creates a new array type with the same dimentions but with the new
+// element type.
+static llvm::Type *
+createArrayWithNewElementType(CodeGenModule &CGM,
+                              const ConstantArrayType *ArrayType,
+                              llvm::Type *NewElemType) {
+  const clang::Type *ArrayElemType = ArrayType->getArrayElementTypeNoTypeQual();
+  if (ArrayElemType->isConstantArrayType())
+    NewElemType = createArrayWithNewElementType(
+        CGM, cast<const ConstantArrayType>(ArrayElemType), NewElemType);
+  return llvm::ArrayType::get(NewElemType, ArrayType->getSExtSize());
+}
+
+// Returns the size of a scalar or vector in bytes
+static unsigned getScalarOrVectorSizeInBytes(llvm::Type *Ty) {
+  assert(Ty->isVectorTy() || Ty->isIntegerTy() || Ty->isFloatingPointTy());
+  if (Ty->isVectorTy()) {
+    llvm::FixedVectorType *FVT = cast<llvm::FixedVectorType>(Ty);
+    return FVT->getNumElements() *
+           (FVT->getElementType()->getScalarSizeInBits() / 8);
+  }
+  return Ty->getScalarSizeInBits() / 8;
+}
+
+} // namespace
+
+namespace clang {
+namespace CodeGen {
+
+// Creates a layout type for given struct with HLSL constant buffer layout
+// taking into account Packoffsets, if provided.
+// Previously created layout types are cached by CGHLSLRuntime.
+//
+// The function iterates over all fields of the StructType (including base
+// classes) and calls layoutField to converts each field to its corresponding
+// LLVM type and to calculate its HLSL constant buffer layout. Any embedded
+// structs (or arrays of structs) are converted to target layout types as well.
+llvm::TargetExtType *HLSLBufferLayoutBuilder::createLayoutType(
+    const RecordType *StructType,
+    const llvm::SmallVector<unsigned> *Packoffsets) {
+
+  // check if we already have the layout type for this struct
+  if (llvm::TargetExtType *Ty =
+          CGM.getHLSLRuntime().getHLSLBufferLayoutType(StructType))
+    return Ty;
+
+  SmallVector<unsigned> Layout;
+  SmallVector<llvm::Type *> LayoutElements;
+  unsigned Index = 0; // packoffset index
+  unsigned EndOffset = 0;
+
+  // reserve first spot in the layout vector for buffer size
+  Layout.push_back(0);
+
+  // iterate over all fields of the record, including fields on base classes
+  llvm::SmallVector<const RecordType *> RecordTypes;
+  RecordTypes.push_back(StructType);
+  while (RecordTypes.back()->getAsCXXRecordDecl()->getNumBases()) {
+    CXXRecordDecl *D = RecordTypes.back()->getAsCXXRecordDecl();
+    assert(D->getNumBases() == 1 &&
+           "HLSL doesn't support multiple inheritance");
+    RecordTypes.push_back(D->bases_begin()->getType()->getAs<RecordType>());
+  }
+  while (!RecordTypes.empty()) {
+    const RecordType *RT = RecordTypes.back();
+    RecordTypes.pop_back();
+
+    for (const auto *FD : RT->getDecl()->fields()) {
+      assert(!Packoffsets || Index < Packoffsets->size() &&
+                                 "number of elements in layout struct does not "
+                                 "match number of packoffset annotations");
+
+      if (!layoutField(FD, EndOffset, Layout, LayoutElements,
+                       Packoffsets ? (*Packoffsets)[Index] : -1))
+        return nullptr;
+      Index++;
+    }
+  }
+
+  // set the size of the buffer
+  Layout[0] = EndOffset;
+
+  // create the layout struct type; anonymous struct have empty name but
+  // non-empty qualified name
+  const CXXRecordDecl *Decl = StructType->getAsCXXRecordDecl();
+  std::string Name =
+      Decl->getName().empty() ? "anon" : Decl->getQualifiedNameAsString();
+  llvm::StructType *StructTy =
+      llvm::StructType::create(LayoutElements, Name, true);
+
+  // create target layout type
+  llvm::TargetExtType *NewLayoutTy = llvm::TargetExtType::get(
+      CGM.getLLVMContext(), LayoutTypeName, {StructTy}, Layout);
+  if (NewLayoutTy)
+    CGM.getHLSLRuntime().addHLSLBufferLayoutType(StructType, NewLayoutTy);
+  return NewLayoutTy;
+}
+
+// The function converts a single field of HLSL Buffer to its corresponding
+// LLVM type and calculates it's layout. Any embedded structs (or
+// arrays of structs) are converted to target layout types as well.
+// The converted type is appended to the LayoutElements list, the element
+// offset is added to the Layout list and the EndOffset updated to the offset
+// just after the lay-ed out element (which is basically the size of the
+// buffer).
+// Returns true if the conversion was successful.
+// The packoffset parameter contains the field's layout offset provided by the
+// user or -1 if there was no packoffset (or register(cX)) annotation.
+bool HLSLBufferLayoutBuilder::layoutField(
+    const FieldDecl *FD, unsigned &EndOffset, SmallVector<unsigned> &Layout,
+    SmallVector<llvm::Type *> &LayoutElements, int Packoffset) {
+
+  // Size of element; for arrays this is a size of a single element in the
+  // array. Total array size of calculated as (ArrayCount-1) * ArrayStride +
+  // ElemSize.
+  unsigned ElemSize = 0;
+  unsigned ElemOffset = 0;
+  unsigned ArrayCount = 1;
+  unsigned ArrayStride = 0;
+
+  const unsigned BufferRowAlign = 16U;
+  unsigned NextRowOffset = llvm::alignTo(EndOffset, BufferRowAlign);
+
+  llvm::Type *ElemLayoutTy = nullptr;
+  QualType FieldTy = FD->getType();
+
+  if (FieldTy->isConstantArrayType()) {
+    // Unwrap array to find the element type and get combined array size.
+    QualType Ty = FieldTy;
+    while (Ty->isConstantArrayType()) {
+      const ConstantArrayType *ArrayTy = cast<ConstantArrayType>(Ty);
+      ArrayCount *= ArrayTy->getSExtSize();
+      Ty = ArrayTy->getElementType();
+    }
+    // For array of structures, create a new array with a layout type
+    // instead of the structure type.
+    if (Ty->isStructureType()) {
+      llvm::Type *NewTy =
+          cast<llvm::TargetExtType>(createLayoutType(Ty->getAsStructureType()));
+      if (!NewTy)
+        return false;
+      assert(isa<llvm::TargetExtType>(NewTy) && "expected target type");
+      ElemSize = cast<llvm::TargetExtType>(NewTy)->getIntParameter(0);
+      ElemLayoutTy = createArrayWithNewElementType(
+          CGM, cast<ConstantArrayType>(FieldTy.getTypePtr()), NewTy);
+    } else {
+      // Array of vectors or scalars
+      ElemSize =
+          getScalarOrVectorSizeInBytes(CGM.getTypes().ConvertTypeForMem(Ty));
+      ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy);
+    }
+    ArrayStride = llvm::alignTo(ElemSize, BufferRowAlign);
+    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
+
+  } else if (FieldTy->isStructureType()) {
+    // Create a layout type for the structure
+    ElemLayoutTy = createLayoutType(FieldTy->getAsStructureType());
+    if (!ElemLayoutTy)
+      return false;
+    assert(isa<llvm::TargetExtType>(ElemLayoutTy) && "expected target type");
+    ElemSize = cast<llvm::TargetExtType>(ElemLayoutTy)->getIntParameter(0);
+    ElemOffset = (Packoffset != -1) ? Packoffset : NextRowOffset;
+
+  } else {
+    // scalar or vector - find element size and alignment
+    unsigned Align = 0;
+    ElemLayoutTy = CGM.getTypes().ConvertTypeForMem(FieldTy);
+    if (ElemLayoutTy->isVectorTy()) {
+      // align vectors by sub element size
+      const llvm::FixedVectorType *FVT =
+          cast<llvm::FixedVectorType>(ElemLayoutTy);
+      unsigned SubElemSize = FVT->getElementType()->getScalarSizeInBits() / 8;
+      ElemSize = FVT->getNumElements() * SubElemSize;
+      Align = SubElemSize;
+    } else {
+      assert(ElemLayoutTy->isIntegerTy() || ElemLayoutTy->isFloatingPointTy());
+      ElemSize = ElemLayoutTy->getScalarSizeInBits() / 8;
+      Align = ElemSize;
+    }
+
+    // calculate or get element offset for the vector or scalar
+    if (Packoffset != -1) {
+      ElemOffset = Packoffset;
+    } else {
+      ElemOffset = llvm::alignTo(EndOffset, Align);
+      // if the element does not fit, move it to the next row
+      if (ElemOffset + ElemSize > NextRowOffset)
+        ElemOffset = NextRowOffset;
+    }
+  }
+
+  // Update end offset of the layout; do not update it if the EndOffset
+  // is already bigger than the new value (which may happen with unordered
+  // packoffset annotations)
+  unsigned NewEndOffset =
+      ElemOffset + (ArrayCount - 1) * ArrayStride + ElemSize;
+  EndOffset = std::max<unsigned>(EndOffset, NewEndOffset);
+
+  // add the layout element and offset to the lists
+  Layout.push_back(ElemOffset);
+  LayoutElements.push_back(ElemLayoutTy);
+  return true;
+}
+
+} // namespace CodeGen
+} // namespace clang
diff --git a/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
new file mode 100644
index 0000000000000..57bb17c557b9c
--- /dev/null
+++ b/clang/lib/CodeGen/HLSLBufferLayoutBuilder.h
@@ -0,0 +1,48 @@
+//===- HLSLBufferLayoutBuilder.h ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/DerivedTypes.h"
+
+namespace clang {
+class RecordType;
+class FieldDecl;
+
+namespace CodeGen {
+class CodeGenModule;
+
+//===----------------------------------------------------------------------===//
+// Implementation of constant buffer layout common between DirectX and
+// SPIR/SPIR-V.
+//===----------------------------------------------------------------------===//
+
+class HLSLBufferLayoutBuilder {
+private:
+  CodeGenModule &CGM;
+  llvm::StringRef LayoutTypeName;
+
+public:
+  HLSLBufferLayoutBuilder(CodeGenModule &CGM, llvm::StringRef LayoutTypeName)
+      : CGM(CGM), LayoutTypeName(LayoutTypeName) {}
+
+  // Returns LLVM target extension type with the name LayoutTypeName
+  // for given structure type and layout data. The first number in
+  // the Layout is the size followed by offsets for each struct element.
+  llvm::TargetExtType *
+  createLayoutType(const RecordType *StructType,
+                   const llvm::SmallVector<unsigned> *Packoffsets = nullptr);
+
+private:
+  bool layoutField(const clang::FieldDecl *FD, unsigned &EndOffset,
+                   llvm::SmallVector<unsigned> &Layout,
+                   llvm::SmallVector<llvm::Type *> &LayoutElements,
+                   int Packoffset);
+};
+
+} // namespace CodeGen
+} // namespace clang
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 4a66683a3b91f..86057c14a549e 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -439,7 +439,9 @@ class TargetCodeGenInfo {
   }
 
   /// Return an LLVM type that corresponds to a HLSL type
-  virtual llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T) const {
+  virtual llvm::Type *
+  getHLSLType(CodeGenModule &CGM, const Type *T,
+              const SmallVector<unsigned> *Packoffsets = nullptr) const {
     return nullptr;
   }
 
diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp
index 7935f7ae37004..77091eb45f5cf 100644
--- a/clang/lib/CodeGen/Targets/DirectX.cpp
+++ b/clang/lib/CodeGen/Targets/DirectX.cpp
@@ -7,8 +7,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "ABIInfoImpl.h"
+#include "CodeGenModule.h"
+#include "HLSLBufferLayoutBuilder.h"
 #include "TargetInfo.h"
+#include "clang/AST/Type.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -24,11 +29,14 @@ class DirectXTargetCodeGenInfo : public TargetCodeGenInfo {
   DirectXTargetCodeGenInfo(CodeGen::CodeGenTypes &CGT)
       : TargetCodeGenInfo(std::make_unique<DefaultABIInfo>(CGT)) {}
 
-  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *T) const override;
+  llvm::Type *getHLSLType(
+      CodeGenModule &CGM, const Type *T,
+      const SmallVector<unsigned> *Packoffsets = nullptr) const override;
 };
 
-llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM,
-                                                  const Type *Ty) const {
+llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(
+    CodeGenModule &CGM, const Type *Ty,
+    const SmallVector<unsigned> *Packoffsets) const {
   auto *ResType = dyn_cast<HLSLAttributedResourceType>(Ty);
   if (!ResType)
     return nullptr;
@@ -56,9 +64,19 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM,
 
     return llvm::TargetExtType::get(Ctx, TypeName, {ElemType}, Ints);
   }
-  case llvm::dxil::ResourceClass::CBuffer:
-    llvm_unreachable("dx.CBuffer handles are not implemented yet");
-    break;
+  case llvm::dxil::ResourceClass::CBuffer: {
+    QualType ContainedTy = ResType->getContainedType();
+    if (ContainedTy.isNull() || !ContainedTy->isStructureType())
+      return nullptr;
+
+    llvm::Type *BufferLayoutTy =
+        HLSLBufferLayoutBuilder(CGM, "dx.Layout")
+            .createLayoutType(ContainedTy->getAsStructureType(), Packoffsets);
+    if (!BufferLayoutTy)
+      return nullptr;
+
+    return llvm::TargetExtType::get(Ctx, "dx.CBuffer", {BufferLayoutTy});
+  }
   case llvm::dxil::ResourceClass::Sampler:
     llvm_unreachable("dx.Sampler handles are not implemented yet");
     break;
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index b81ed29a5159b..c94db31ae1a89 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -52,7 +52,9 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo {
 
   unsigned getOpenCLKernelCallingConv() const override;
   llvm::Type *getOpenCLType(CodeGenModule &CGM, const Type *T) const override;
-  llvm::Type *getHLSLType(CodeGenModule &CGM, const Type *Ty) const override;
+  llvm::Type *getHLSLType(
+      CodeGenModule &CGM, const Type *Ty,
+      const SmallVector<unsigned> *Packoffsets = nullptr) const override;
   llvm::Type *getSPIRVImageTypeFromHLSLResource(
       const HLSLAttributedResourceType::Attributes &attributes,
       llvm::Type *ElementType, llvm::LLVMContext &Ctx) const;
@@ -367,8 +369,9 @@ llvm::Type *CommonSPIRTargetCodeGenInfo::getOpenCLType(CodeGenModule &CGM,
   return nullptr;
 }
 
-llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM,
-                                                     const Type *Ty) const {
+llvm::Type *CommonSPIRTargetCodeGenInfo::getHLSLType(
+    CodeGenModule &CGM, const Type *Ty,
+    const SmallVector<unsigned> *Packoffsets) const {
   auto *ResType = dyn_cast<HLSLAttributedResourceType>(Ty);
   if (!ResType)
     return nullptr;
diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 362df485a025c..d95763b22a819 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -14295,6 +14295,13 @@ void Sema::ActOnUninitializedDecl(Decl *RealDecl) {
     if (getLangOpts().OpenCL &&
         Var->getType().getAddressSpace() == LangAS::opencl_local)
       return;
+
+    // In HLSL, objects in the hlsl_constant address space are initialized
+    // externally, so don't synthesize an implicit initializer.
+    if (getLangOpts().HLSL &&
+        Var->getType().getAddressSpace() == LangAS::hlsl_constant)
+      return;
+
     // C++03 [dcl.init]p9:
     //   If no initializer is specified for an object, and the
     //   object is of (possibly cv-qualified) non-POD class type (or
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 20275ded8a561..d26d85d5861b1 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -176,9 +176,9 @@ Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer,
 // https://learn.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-packing-rules
 static unsigned calculateLegacyCbufferSize(const ASTContext &Context,
                                            QualType T) {
-  unsigned Size = 0;
   constexpr unsigned CBufferAlign = 16;
   if (const RecordType *RT = T->getAs<RecordType>()) {
+    unsigned Size = 0;
     const RecordDecl *RD = RT->getDecl();
     for (const FieldDecl *Field : RD->fields()) {
       QualType Ty = Field->getType();
@@ -191,22 +191,28 @@ static unsigned calculateLegacyCbufferSize(const ASTContext &Context,
       Size = llvm::alignTo(Size, FieldAlign);
       Size += FieldSize;
     }
-  } else if (const ConstantArrayType *AT = Context.getAsConstantArrayType(T)) {
-    if (unsigned ElementCount = AT->getSize().getZExtValue()) {
-      unsigned ElementSize =
-          calculateLegacyCbufferSize(Context, AT->getElementType());
-      unsigned AlignedElementSize = llvm::alignTo(ElementSize, CBufferAlign);
-      Size = AlignedElementSize * (ElementCount - 1) + ElementSize;
-    }
-  } else if (const VectorType *VT = T->getAs<VectorType>()) {
+    return Size;
+  }
+
+  if (const ConstantArrayType *AT = Context.getAsConstantArrayType(T)) {
+    unsigned ElementCount = AT->getSize().getZExtValue();
+    if (ElementCount == 0)
+      return 0;
+
+    unsigned ElementSize =
+        calculateLegacyCbufferSize(Context, AT->getElementType());
+    unsigned AlignedElementSize = llvm::alignTo(ElementSize, CBufferAlign);
+    return AlignedElementSize * (ElementCount - 1) + ElementSize;
+  }
+
+  if (const VectorType *VT = T->getAs<VectorType>()) {
     unsigned ElementCount = VT->getNumElements();
     unsigned ElementSize =
         calculateLegacyCbufferSize(Context, VT->getElementType());
-    Size = ElementSize * ElementCount;
-  } else {
-    Size = Context.getTypeSize(T) / 8;
+    return ElementSize * ElementCount;
   }
-  return Size;
+
+  return Context.getTypeSize(T) / 8;
 }
 
 // Validate packoffset:
@@ -239,6 +245,7 @@ static void validatePackoffset(Sema &S, HLSLBufferDecl *BufDecl) {
 
   // Make sure there is no overlap in packoffset - sort PackOffsetVec by offset
   // and compare adjacent values.
+  bool IsValid = true;
   ASTContext &Context = S.getASTContext();
   std::sort(PackOffsetVec.begin(), PackOffsetVec.end(),
             [](const std::pair<VarDecl *, HLSLPackOffsetAttr *> &LHS,
@@ -257,8 +264,10 @@ static void validatePackoffset(Sema &S, HLSLBufferDecl *BufDecl) {
       VarDecl *NextVar = PackOffsetVec[i + 1].first;
       S.Diag(NextVar->getLocation(), diag::err_hlsl_packoffset_overlap)
           << NextVar << Var;
+      IsValid = false;
     }
   }
+  BufDecl->setHasValidPackoffset(IsValid);
 }
 
 // Returns true if the array has a zero size = if any of the dimensions is 0
@@ -500,7 +509,7 @@ void createHostLayoutStructForBuffer(Sema &S, HLSLBufferDecl *BufDecl) {
     }
   }
   LS->completeDefinition();
-  BufDecl->addDecl(LS);
+  BufDecl->addLayoutStruct(LS);
 }
 
 // Handle end of cbuffer/tbuffer declaration
diff --git a/clang/test/CodeGen/aapcs-align.cpp b/clang/test/CodeGen/aapcs-align.cpp
index 4f393d9e6b7f3..c7bc5ba0bbfef 100644
--- a/clang/test/CodeGen/aapcs-align.cpp
+++ b/clang/test/CodeGen/aapcs-align.cpp
@@ -6,6 +6,11 @@
 
 extern "C" {
 
+// CHECK: @sizeof_OverSizedBitfield ={{.*}} global i32 8
+// CHECK: @alignof_OverSizedBitfield ={{.*}} global i32 8
+// CHECK: @sizeof_VeryOverSizedBitfield ={{.*}} global i32 16
+// CHECK: @alignof_VeryOverSizedBitfield ={{.*}} global i32 8
+
 // Base case, nothing interesting.
 struct S {
   int x, y;
@@ -138,4 +143,42 @@ void g6() {
 // CHECK: call void @f6m(i32 noundef 1, i32 noundef 2, i32 noundef 3, i32 noundef 4, i32 noundef 5, [4 x i32] [i32 6, i32 7, i32 0, i32 undef])
 // CHECK: declare void @f6(i32 noundef, [4 x i32])
 // CHECK: declare void @f6m(i32 noundef, i32 noundef, i32 noundef, i32 noundef, i32 noundef, [4 x i32])
+
+// Over-sized bitfield, which results in a 64-bit container type, so 64-bit
+// alignment.
+struct OverSizedBitfield {
+  int x : 64;
+};
+
+unsigned sizeof_OverSizedBitfield = sizeof(OverSizedBitfield);
+unsigned alignof_OverSizedBitfield = alignof(OverSizedBitfield);
+
+// CHECK: define{{.*}} void @g7
+// CHECK: call void @f7(i32 noundef 1, [1 x i64] [i64 42])
+// CHECK: declare void @f7(i32 noundef, [1 x i64])
+void f7(int a, OverSizedBitfield b);
+void g7() {
+  OverSizedBitfield s = {42};
+  f7(1, s);
+}
+
+// There are no 128-bit fundamental data types defined by AAPCS32, so this gets
+// a 64-bit container plus 64 bits of padding, giving it a size of 16 bytes and
+// alignment of 8 bytes.
+struct VeryOverSizedBitfield {
+  int x : 128;
+};
+
+unsigned sizeof_VeryOverSizedBitfield = sizeof(VeryOverSizedBitfield);
+unsigned alignof_VeryOverSizedBitfield = alignof(VeryOverSizedBitfield);
+
+// CHECK: define{{.*}} void @g8
+// CHECK: call void @f8(i32 noundef 1, [2 x i64] [i64 42, i64 0])
+// CHECK: declare void @f8(i32 noundef, [2 x i64])
+void f8(int a, VeryOverSizedBitfield b);
+void g8() {
+  VeryOverSizedBitfield s = {42};
+  f8(1, s);
+}
+
 }
diff --git a/clang/test/CodeGen/aapcs64-align.cpp b/clang/test/CodeGen/aapcs64-align.cpp
index 7a8151022852e..e69faf231936c 100644
--- a/clang/test/CodeGen/aapcs64-align.cpp
+++ b/clang/test/CodeGen/aapcs64-align.cpp
@@ -5,6 +5,13 @@
 
 extern "C" {
 
+// CHECK: @sizeof_OverSizedBitfield ={{.*}} global i32 8
+// CHECK: @alignof_OverSizedBitfield ={{.*}} global i32 8
+// CHECK: @sizeof_VeryOverSizedBitfield ={{.*}} global i32 16
+// CHECK: @alignof_VeryOverSizedBitfield ={{.*}} global i32 16
+// CHECK: @sizeof_RidiculouslyOverSizedBitfield ={{.*}} global i32 32
+// CHECK: @alignof_RidiculouslyOverSizedBitfield ={{.*}} global i32 16
+
 // Base case, nothing interesting.
 struct S {
   long x, y;
@@ -161,5 +168,62 @@ int test_bitint8(){
 }
 // CHECK:  ret i32 1
 
+// Over-sized bitfield, which results in a 64-bit container type, so 64-bit
+// alignment.
+struct OverSizedBitfield {
+  int x : 64;
+};
+
+unsigned sizeof_OverSizedBitfield = sizeof(OverSizedBitfield);
+unsigned alignof_OverSizedBitfield = alignof(OverSizedBitfield);
+
+// CHECK: define{{.*}} void @g7
+// CHECK: call void @f7(i32 noundef 1, i64 42)
+// CHECK: declare void @f7(i32 noundef, i64)
+void f7(int a, OverSizedBitfield b);
+void g7() {
+  OverSizedBitfield s = {42};
+  f7(1, s);
+}
+
+// AAPCS64 does have a 128-bit integer fundamental data type, so this gets a
+// 128-bit container with 128-bit alignment. This is just within the limit of
+// what can be passed directly.
+struct VeryOverSizedBitfield {
+  int x : 128;
+};
+
+unsigned sizeof_VeryOverSizedBitfield = sizeof(VeryOverSizedBitfield);
+unsigned alignof_VeryOverSizedBitfield = alignof(VeryOverSizedBitfield);
+
+// CHECK: define{{.*}} void @g8
+// CHECK: call void @f8(i32 noundef 1, i128 42)
+// CHECK: declare void @f8(i32 noundef, i128)
+void f8(int a, VeryOverSizedBitfield b);
+void g8() {
+  VeryOverSizedBitfield s = {42};
+  f8(1, s);
+}
+
+// There are no bigger fundamental data types, so this gets a 128-bit container
+// and 128 bits of padding, giving the struct a size of 32 bytes, and an
+// alignment of 16 bytes. This is over the PCS size limit of 16 bytes, so it
+// will be passed indirectly.
+struct RidiculouslyOverSizedBitfield {
+  int x : 256;
+};
+
+unsigned sizeof_RidiculouslyOverSizedBitfield = sizeof(RidiculouslyOverSizedBitfield);
+unsigned alignof_RidiculouslyOverSizedBitfield = alignof(RidiculouslyOverSizedBitfield);
+
+// CHECK: define{{.*}} void @g9
+// CHECK: call void @f9(i32 noundef 1, ptr noundef nonnull %agg.tmp)
+// CHECK: declare void @f9(i32 noundef, ptr noundef)
+void f9(int a, RidiculouslyOverSizedBitfield b);
+void g9() {
+  RidiculouslyOverSizedBitfield s = {42};
+  f9(1, s);
+}
+
 }
 
diff --git a/clang/test/CodeGen/armv7k-abi.c b/clang/test/CodeGen/armv7k-abi.c
index fd18dafa7d03f..872e6423a4a99 100644
--- a/clang/test/CodeGen/armv7k-abi.c
+++ b/clang/test/CodeGen/armv7k-abi.c
@@ -16,7 +16,7 @@ typedef struct {
 void simple_hfa(HFA h) {}
 
 // CHECK: define{{.*}} %struct.HFA @return_simple_hfa
-HFA return_simple_hfa() {}
+HFA return_simple_hfa() { return (HFA){0}; }
 
 typedef struct {
   double arr[4];
@@ -43,7 +43,7 @@ typedef struct {
 void big_struct_indirect(BigStruct b) {}
 
 // CHECK: define{{.*}} void @return_big_struct_indirect(ptr dead_on_unwind noalias writable sret
-BigStruct return_big_struct_indirect() {}
+BigStruct return_big_struct_indirect() { return (BigStruct){0}; }
 
 // Structs smaller than 16 bytes should be passed directly, and coerced to
 // either [N x i32] or [N x i64] depending on alignment requirements.
@@ -58,7 +58,7 @@ typedef struct {
 void small_struct_direct(SmallStruct s) {}
 
 // CHECK: define{{.*}} [4 x i32] @return_small_struct_direct()
-SmallStruct return_small_struct_direct() {}
+SmallStruct return_small_struct_direct() { return (SmallStruct){0}; }
 
 typedef struct {
   float x;
@@ -75,14 +75,14 @@ typedef struct {
 } PaddedSmallStruct;
 
 // CHECK: define{{.*}} i32 @return_padded_small_struct()
-PaddedSmallStruct return_padded_small_struct() {}
+PaddedSmallStruct return_padded_small_struct() { return (PaddedSmallStruct){0}; }
 
 typedef struct {
   char arr[7];
 } OddlySizedStruct;
 
 // CHECK: define{{.*}} [2 x i32] @return_oddly_sized_struct()
-OddlySizedStruct return_oddly_sized_struct() {}
+OddlySizedStruct return_oddly_sized_struct() { return (OddlySizedStruct){0}; }
 
 // CHECK: define{{.*}} <4 x float> @test_va_arg_vec(ptr noundef %l)
 
diff --git a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
index e475f032f5ce3..b7aad6a5bcd21 100644
--- a/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
+++ b/clang/test/CodeGenCXX/debug-info-structured-binding-bitfield.cpp
@@ -248,8 +248,8 @@ struct S15 {
 };
 
 // CHECK-LABEL: define dso_local void @_Z4fS15v
-// CHECK:                        alloca %struct.S15, align 8
-// CHECK-NEXT:    [[TMP0:%.*]] = alloca %struct.S15, align 8
+// CHECK:                        alloca %struct.S15, align 16
+// CHECK-NEXT:    [[TMP0:%.*]] = alloca %struct.S15, align 16
 // CHECK:         #dbg_declare(ptr [[TMP0]], [[S15_A:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 32),
 // CHECK-NEXT:    #dbg_declare(ptr [[TMP0]], [[S15_B:![0-9]+]], !DIExpression(DW_OP_plus_uconst, 16, DW_OP_LLVM_extract_bits_zext, 0, 32),
 //
diff --git a/clang/test/CodeGenHLSL/cbuf.hlsl b/clang/test/CodeGenHLSL/cbuf.hlsl
deleted file mode 100644
index 825e7b8161a60..0000000000000
--- a/clang/test/CodeGenHLSL/cbuf.hlsl
+++ /dev/null
@@ -1,33 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// CHECK: @a = external addrspace(2) externally_initialized global float, align 4
-// CHECK: @b = external addrspace(2) externally_initialized global double, align 8
-// CHECK: @c = external addrspace(2) externally_initialized global float, align 4
-// CHECK: @d = external addrspace(2) externally_initialized global double, align 8
-
-// CHECK: @[[CB:.+]] = external constant { float, double }
-cbuffer A : register(b0, space2) {
-  float a;
-  double b;
-}
-
-// CHECK: @[[TB:.+]] = external constant { float, double }
-tbuffer A : register(t2, space1) {
-  float c;
-  double d;
-}
-
-float foo() {
-// CHECK: load float, ptr addrspace(2) @a, align 4
-// CHECK: load double, ptr addrspace(2) @b, align 8
-// CHECK: load float, ptr addrspace(2) @c, align 4
-// CHECK: load double, ptr addrspace(2) @d, align 8
-  return a + b + c*d;
-}
-
-// CHECK: !hlsl.cbufs = !{![[CBMD:[0-9]+]]}
-// CHECK: ![[CBMD]] = !{ptr @[[CB]], i32 13, i32 0, i1 false, i32 0, i32 2}
diff --git a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl b/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl
deleted file mode 100644
index 13c401d428331..0000000000000
--- a/clang/test/CodeGenHLSL/cbuf_in_namespace.hlsl
+++ /dev/null
@@ -1,29 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// Make sure cbuffer inside namespace works.
-
-// CHECK: @_ZN2n02n11aE = external addrspace(2) externally_initialized global float, align 4
-// CHECK: @_ZN2n01bE = external addrspace(2) externally_initialized global float, align 4
-
-// CHECK: @[[CB:.+]] = external constant { float }
-// CHECK: @[[TB:.+]] = external constant { float }
-namespace n0 {
-namespace n1 {
-  cbuffer A {
-    float a;
-  }
-}
-  tbuffer B {
-    float b;
-  }
-}
-
-float foo() {
-// CHECK: load float, ptr addrspace(2) @_ZN2n02n11aE, align 4
-// CHECK: load float, ptr addrspace(2) @_ZN2n01bE, align 4
-  return n0::n1::a + n0::b;
-}
diff --git a/clang/test/CodeGenHLSL/cbuffer.hlsl b/clang/test/CodeGenHLSL/cbuffer.hlsl
new file mode 100644
index 0000000000000..38093c6dfacd7
--- /dev/null
+++ b/clang/test/CodeGenHLSL/cbuffer.hlsl
@@ -0,0 +1,197 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-compute \
+// RUN:   -fnative-half-type -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+// CHECK: %__cblayout_CBScalars = type <{ float, double, half, i64, i32, i16, i32, i64 }>
+// CHECK: %__cblayout_CBVectors = type <{ <3 x float>, <3 x double>, <2 x half>, <3 x i64>, <4 x i32>, <3 x i16>, <3 x i64> }>
+// CHECK: %__cblayout_CBArrays = type <{ [3 x float], [2 x <3 x double>], [2 x [2 x half]], [3 x i64], [2 x [3 x [4 x <4 x i32>]]], [1 x i16], [2 x i64], [4 x i32] }>
+// CHECK: %__cblayout_CBStructs = type <{ target("dx.Layout", %A, 8, 0), target("dx.Layout", %B, 14, 0, 8), 
+// CHECK-SAME: target("dx.Layout", %C, 24, 0, 16), [5 x target("dx.Layout", %A, 8, 0)], 
+// CHECK-SAME: target("dx.Layout", %__cblayout_D, 94, 0), half, <3 x i16> }>
+
+// CHECK: %A = type <{ <2 x float> }>
+// CHECK: %B = type <{ <2 x float>, <3 x i16> }>
+// CHECK: %C = type <{ i32, target("dx.Layout", %A, 8, 0) }>
+// CHECK: %__cblayout_D = type <{ [2 x [3 x target("dx.Layout", %B, 14, 0, 8)]] }>
+
+// CHECK: %__cblayout_CBMix = type <{ [2 x target("dx.Layout", %Test, 8, 0, 4)], float, [3 x [2 x <2 x float>]], float,
+// CHECK-SAME: target("dx.Layout", %anon, 4, 0), double, target("dx.Layout", %anon.0, 8, 0), float, <1 x double>, i16 }>
+
+// CHECK: %Test = type <{ float, float }>
+// CHECK: %anon = type <{ float }>
+// CHECK: %anon.0 = type <{ <2 x i32> }>
+
+cbuffer CBScalars : register(b1, space5) {
+  float a1;
+  double a2;
+  float16_t a3;
+  uint64_t a4;
+  int a5;
+  uint16_t a6;
+  bool a7;
+  int64_t a8;
+}
+
+// CHECK: @CBScalars.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 
+// CHECK-SAME: 56, 0, 8, 16, 24, 32, 36, 40, 48))
+// CHECK: @a1 = external addrspace(2) global float, align 4
+// CHECK: @a2 = external addrspace(2) global double, align 8
+// CHECK: @a3 = external addrspace(2) global half, align 2
+// CHECK: @a4 = external addrspace(2) global i64, align 8
+// CHECK: @a5 = external addrspace(2) global i32, align 4
+// CHECK: @a6 = external addrspace(2) global i16, align 2
+// CHECK: @a7 = external addrspace(2) global i32, align 4
+// CHECK: @a8 = external addrspace(2) global i64, align 8
+
+cbuffer CBVectors {
+  float3 b1;
+  double3 b2;
+  float16_t2 b3;
+  uint64_t3 b4;
+  int4 b5;
+  uint16_t3 b6;
+  int64_t3 b7;
+  // FIXME: add a bool vectors after llvm-project/llvm#91639 is added
+}
+
+// CHECK: @CBVectors.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CBVectors, 
+// CHECK-SAME: 136, 0, 16, 40, 48, 80, 96, 112))
+// CHECK: @b1 = external addrspace(2) global <3 x float>, align 16
+// CHECK: @b2 = external addrspace(2) global <3 x double>, align 32
+// CHECK: @b3 = external addrspace(2) global <2 x half>, align 4
+// CHECK: @b4 = external addrspace(2) global <3 x i64>, align 32
+// CHECK: @b5 = external addrspace(2) global <4 x i32>, align 16
+// CHECK: @b6 = external addrspace(2) global <3 x i16>, align 8
+// CHECK: @b7 = external addrspace(2) global <3 x i64>, align 32
+
+cbuffer CBArrays : register(b2) {
+  float c1[3];
+  double3 c2[2];
+  float16_t c3[2][2];
+  uint64_t c4[3];
+  int4 c5[2][3][4];
+  uint16_t c6[1];
+  int64_t c7[2];
+  bool c8[4];
+}
+
+// CHECK: @CBArrays.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 
+// CHECK-SAME: 708, 0, 48, 112, 176, 224, 608, 624, 656))
+// CHECK: @c1 = external addrspace(2) global [3 x float], align 4
+// CHECK: @c2 = external addrspace(2) global [2 x <3 x double>], align 32
+// CHECK: @c3 = external addrspace(2) global [2 x [2 x half]], align 2
+// CHECK: @c4 = external addrspace(2) global [3 x i64], align 8
+// CHECK: @c5 = external addrspace(2) global [2 x [3 x [4 x <4 x i32>]]], align 16
+// CHECK: @c6 = external addrspace(2) global [1 x i16], align 2
+// CHECK: @c7 = external addrspace(2) global [2 x i64], align 8
+// CHECK: @c8 = external addrspace(2) global [4 x i32], align 4
+
+struct Empty {};
+
+struct A {
+  float2 f1;
+};
+
+struct B : A {
+  uint16_t3 f2;
+};
+
+struct C {
+  int i;
+  A f3;
+};
+
+struct D {
+  B array_of_B[2][3];
+  Empty es;
+};
+
+// CHECK: @CBStructs.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CBStructs, 
+// CHECK-SAME: 246, 0, 16, 32, 64, 144, 238, 240))
+// CHECK: @a = external addrspace(2) global target("dx.Layout", %A, 8, 0), align 8
+// CHECK: @b = external addrspace(2) global target("dx.Layout", %B, 14, 0, 8), align 8
+// CHECK: @c = external addrspace(2) global target("dx.Layout", %C, 24, 0, 16), align 8
+// CHECK: @array_of_A = external addrspace(2) global [5 x target("dx.Layout", %A, 8, 0)], align 8
+// CHECK: @d = external addrspace(2) global target("dx.Layout", %__cblayout_D, 94, 0), align 8
+// CHECK: @e = external addrspace(2) global half, align 2
+// CHECK: @f = external addrspace(2) global <3 x i16>, align 8
+
+cbuffer CBStructs {
+  A a;
+  B b;
+  C c;
+  A array_of_A[5];
+  D d;
+  half e;
+  uint16_t3 f;
+};
+
+struct Test {
+    float a, b;
+};
+
+// CHECK: @CBMix.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CBMix,
+// CHECK-SAME: 170, 0, 24, 32, 120, 128, 136, 144, 152, 160, 168))
+// CHECK: @test = external addrspace(2) global [2 x target("dx.Layout", %Test, 8, 0, 4)], align 4
+// CHECK: @f1 = external addrspace(2) global float, align 4
+// CHECK: @f2 = external addrspace(2) global [3 x [2 x <2 x float>]], align 8
+// CHECK: @f3 = external addrspace(2) global float, align 4
+// CHECK: @f4 = external addrspace(2) global target("dx.Layout", %anon, 4, 0), align 4
+// CHECK: @f5 = external addrspace(2) global double, align 8
+// CHECK: @f6 = external addrspace(2) global target("dx.Layout", %anon.0, 8, 0), align 8
+// CHECK: @f7 = external addrspace(2) global float, align 4
+// CHECK: @f8 = external addrspace(2) global <1 x double>, align 8
+// CHECK: @f9 = external addrspace(2) global i16, align 2
+
+cbuffer CBMix {
+    Test test[2];
+    float f1;
+    float2 f2[3][2];
+    float f3;
+    struct { float c; } f4;
+    double f5;
+    struct { int2 i; } f6;
+    float f7;
+    vector<double,1> f8;
+    uint16_t f9;
+};  
+
+// CHECK: define internal void @_init_resource_CBScalars.cb()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[HANDLE1:.*]] = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 56, 0, 8, 16, 24, 32, 36, 40, 48))
+// CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBScalarss_56_0_8_16_24_32_36_40_48tt(i32 5, i32 1, i32 1, i32 0, i1 false)
+// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBScalars, 56, 0, 8, 16, 24, 32, 36, 40, 48)) %CBScalars.cb_h, ptr @CBScalars.cb, align 4
+
+// CHECK: define internal void @_init_resource_CBArrays.cb()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %[[HANDLE2:.*]] = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 708, 0, 48, 112, 176, 224, 608, 624, 656))
+// CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBArrayss_708_0_48_112_176_224_608_624_656tt(i32 0, i32 2, i32 1, i32 0, i1 false)
+// CHECK-NEXT: store target("dx.CBuffer", target("dx.Layout", %__cblayout_CBArrays, 708, 0, 48, 112, 176, 224, 608, 624, 656)) %CBArrays.cb_h, ptr @CBArrays.cb, align 4
+
+RWBuffer<float> Buf;
+
+[numthreads(4,1,1)]
+void main() {
+  Buf[0] = a1 + b1.z + c1[2] + a.f1.y + f1;
+}
+
+// CHECK: define internal void @_GLOBAL__sub_I_cbuffer.hlsl()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_init_resource_CBScalars.cb()
+// CHECK-NEXT: call void @_init_resource_CBArrays.cb()
+
+// CHECK: !hlsl.cbs = !{![[CBSCALARS:[0-9]+]], ![[CBVECTORS:[0-9]+]], ![[CBARRAYS:[0-9]+]], ![[CBSTRUCTS:[0-9]+]], ![[CBMIX:[0-9]+]]}
+
+// CHECK: ![[CBSCALARS]] = !{ptr @CBScalars.cb, ptr addrspace(2) @a1, ptr addrspace(2) @a2, ptr addrspace(2) @a3, ptr addrspace(2) @a4,
+// CHECK-SAME: ptr addrspace(2) @a5, ptr addrspace(2) @a6, ptr addrspace(2) @a7, ptr addrspace(2) @a8}
+
+// CHECK: ![[CBVECTORS]] = !{ptr @CBVectors.cb, ptr addrspace(2) @b1, ptr addrspace(2) @b2, ptr addrspace(2) @b3, ptr addrspace(2) @b4,
+// CHECK-SAME: ptr addrspace(2) @b5, ptr addrspace(2) @b6, ptr addrspace(2) @b7}
+
+// CHECK: ![[CBARRAYS]] = !{ptr @CBArrays.cb, ptr addrspace(2) @c1, ptr addrspace(2) @c2, ptr addrspace(2) @c3, ptr addrspace(2) @c4, 
+// CHECK-SAME: ptr addrspace(2) @c5, ptr addrspace(2) @c6, ptr addrspace(2) @c7, ptr addrspace(2) @c8}
+
+// CHECK: ![[CBSTRUCTS]] = !{ptr @CBStructs.cb, ptr addrspace(2) @a, ptr addrspace(2) @b, ptr addrspace(2) @c, ptr addrspace(2) @array_of_A, 
+// CHECK-SAME: ptr addrspace(2) @d, ptr addrspace(2) @e, ptr addrspace(2) @f}
+
+// CHECK: ![[CBMIX]] = !{ptr @CBMix.cb, ptr addrspace(2) @test, ptr addrspace(2) @f1, ptr addrspace(2) @f2, ptr addrspace(2) @f3,
+// CHECK-SAME: ptr addrspace(2) @f4, ptr addrspace(2) @f5, ptr addrspace(2) @f6, ptr addrspace(2) @f7, ptr addrspace(2) @f8, ptr addrspace(2) @f9}
diff --git a/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
new file mode 100644
index 0000000000000..393ca3825c638
--- /dev/null
+++ b/clang/test/CodeGenHLSL/cbuffer_and_namespaces.hlsl
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-library %s \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Make sure cbuffer inside namespace works.
+
+// CHECK: %"n0::n1::__cblayout_A" = type <{ float }>
+// CHECK: %"n0::__cblayout_B" = type <{ float }>
+// CHECK: %"n0::n2::__cblayout_C" = type <{ float, target("dx.Layout", %"n0::Foo", 4, 0) }>
+// CHECK: %"n0::Foo" = type <{ float }>
+
+// CHECK: @A.cb = external constant target("dx.CBuffer", target("dx.Layout", %"n0::n1::__cblayout_A", 4, 0))
+// CHECK: @_ZN2n02n11aE = external addrspace(2) global float, align 4
+
+// CHECK: @B.cb = external constant target("dx.CBuffer", target("dx.Layout", %"n0::__cblayout_B", 4, 0))
+// CHECK: @_ZN2n01aE = external addrspace(2) global float, align 4
+
+// CHECK: @C.cb = external constant target("dx.CBuffer", target("dx.Layout", %"n0::n2::__cblayout_C", 20, 0, 16))
+// CHECK: @_ZN2n02n21aE = external addrspace(2) global float, align 4
+// CHECK: external addrspace(2) global target("dx.Layout", %"n0::Foo", 4, 0), align 4
+
+namespace n0 {
+  struct Foo {
+    float f;
+  };
+
+  namespace n1 {
+    cbuffer A {
+      float a;
+    }
+  }
+  cbuffer B {
+    float a;
+  }
+  namespace n2 {
+    cbuffer C {
+      float a;
+      Foo b;
+    }
+  }
+}
+
+float foo() {
+  // CHECK: load float, ptr addrspace(2) @_ZN2n02n11aE, align 4
+  // CHECK: load float, ptr addrspace(2) @_ZN2n01aE, align 4
+  // CHECK: load float, ptr addrspace(2) @_ZN2n02n21aE, align 4
+  return n0::n1::a + n0::a + n0::n2::a;
+}
+
+[numthreads(4,1,1)]
+void main() {}
+
+// CHECK: !hlsl.cbs = !{![[A:[0-9]+]], ![[B:[0-9]+]], ![[C:[0-9]+]]}
+// CHECK: [[A]] = !{ptr @A.cb, ptr addrspace(2) @_ZN2n02n11aE}
+// CHECK: [[B]] = !{ptr @B.cb, ptr addrspace(2) @_ZN2n01aE}
+// CHECK: [[C]] = !{ptr @C.cb, ptr addrspace(2) @_ZN2n02n21aE, ptr addrspace(2) @_ZN2n02n21bE}
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
new file mode 100644
index 0000000000000..870593986a976
--- /dev/null
+++ b/clang/test/CodeGenHLSL/cbuffer_with_packoffset.hlsl
@@ -0,0 +1,38 @@
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN:   dxil-pc-shadermodel6.3-compute %s \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK: %__cblayout_CB = type <{ float, double, <2 x i32> }>
+
+// CHECK: @CB.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 176, 16, 168, 88))
+// CHECK: @a = external addrspace(2) global float, align 4
+// CHECK: @b = external addrspace(2) global double, align 8
+// CHECK: @c = external addrspace(2) global <2 x i32>, align 8
+
+cbuffer CB : register(b1, space3) {
+  float a : packoffset(c1.x);
+  double b : packoffset(c10.z);
+  int2 c : packoffset(c5.z);
+}
+
+// CHECK: define internal void @_init_resource_CB.cb()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %CB.cb_h = call target("dx.CBuffer", target("dx.Layout", %__cblayout_CB, 176, 16, 168, 88))
+// CHECK-SAME: @llvm.dx.resource.handlefrombinding.tdx.CBuffer_tdx.Layout_s___cblayout_CBs_176_16_168_88tt(i32 3, i32 1, i32 1, i32 0, i1 false)
+
+float foo() {
+  // CHECK: load float, ptr addrspace(2) @a, align 4
+  // CHECK: load double, ptr addrspace(2) @b, align 8
+  return a + b;
+}
+// CHECK: define internal void @_GLOBAL__sub_I_cbuffer_with_packoffset.hlsl()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @_init_resource_CB.cb()
+
+[numthreads(4,1,1)]
+void main() {
+  foo();
+}
+
+// CHECK: !hlsl.cbs = !{![[CB:[0-9]+]]}
+// CHECK: ![[CB]] = !{ptr @CB.cb, ptr addrspace(2) @a, ptr addrspace(2) @b, ptr addrspace(2) @c}
diff --git a/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
new file mode 100644
index 0000000000000..99f40d8fc93d7
--- /dev/null
+++ b/clang/test/CodeGenHLSL/cbuffer_with_static_global_and_function.hlsl
@@ -0,0 +1,29 @@
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
+// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// CHECK: %__cblayout_A = type <{ float }>
+
+// CHECK: @A.cb = external constant target("dx.CBuffer", target("dx.Layout", %__cblayout_A, 4, 0))
+// CHECK: @a = external addrspace(2) global float, align 4
+// CHECK-DAG: @_ZL1b = internal global float 3.000000e+00, align 4
+// CHECK-NOT: @B.cb
+
+cbuffer A {
+  float a;
+  static float b = 3;
+  float foo() { return a + b; }
+}
+
+cbuffer B {
+  // intentionally empty
+}
+
+// CHECK: define {{.*}} float @_Z3foov() #0 {
+// CHECK: load float, ptr addrspace(2) @a, align 4
+
+extern float bar() {
+  return foo();
+}
+
+// CHECK: !hlsl.cbs = !{![[CB:[0-9]+]]}
+// CHECK: ![[CB]] = !{ptr @A.cb, ptr addrspace(2) @a}
diff --git a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl b/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
deleted file mode 100644
index 25f51cce2017d..0000000000000
--- a/clang/test/CodeGenHLSL/static_global_and_function_in_cb.hlsl
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.3-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-// RUN: %clang_cc1 -finclude-default-header -triple spirv-pc-vulkan-library %s \
-// RUN:   -emit-llvm -disable-llvm-passes -o - | FileCheck %s
-
-cbuffer A {
-  // CHECK: @a = external addrspace(2) externally_initialized global float, align 4
-  float a;
-  // CHECK: @_ZL1b = internal global float 3.000000e+00, align 4
-  static float b = 3;
-  float foo() { return a + b; }
-}
-// CHECK: @[[CB:.+]] = external constant { float }
-
-// CHECK:define {{.*}} float @_Z3foov()
-// CHECK:load float, ptr addrspace(2) @a, align 4
-// CHECK:load float, ptr @_ZL1b, align 4
-
-float bar() {
-  return foo();
-}
diff --git a/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl b/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
index 72027eda4571d..59bf87b554af3 100644
--- a/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
+++ b/clang/test/CodeGenOpenCL/atomics-cas-remarks-gfx90a.cl
@@ -36,7 +36,7 @@ typedef enum memory_scope {
 // GFX90A-CAS: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("agent-one-as") monotonic
 // GFX90A-CAS: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("one-as") monotonic
 // GFX90A-CAS: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("wavefront-one-as") monotonic
-float atomic_cas(__global atomic_float *d, float a) {
+void atomic_cas(__global atomic_float *d, float a) {
   float ret1 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group);
   float ret2 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_device);
   float ret3 = __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_all_svm_devices);
diff --git a/clang/test/Index/comment-to-html-xml-conversion.cpp b/clang/test/Index/comment-to-html-xml-conversion.cpp
index e0a7cff5a9a3d..941aa8a27b6bf 100644
--- a/clang/test/Index/comment-to-html-xml-conversion.cpp
+++ b/clang/test/Index/comment-to-html-xml-conversion.cpp
@@ -20,7 +20,6 @@
 // RUN: FileCheck %s < %t/out.c-index-direct
 // RUN: FileCheck %s < %t/out.c-index-pch
 
-// XFAIL: msan
 // XFAIL: valgrind
 
 #ifndef HEADER
diff --git a/compiler-rt/test/hwasan/TestCases/libc_thread_freeres.c b/compiler-rt/test/hwasan/TestCases/libc_thread_freeres.c
index e6d1731f30e37..e95f4c8c4fd43 100644
--- a/compiler-rt/test/hwasan/TestCases/libc_thread_freeres.c
+++ b/compiler-rt/test/hwasan/TestCases/libc_thread_freeres.c
@@ -11,6 +11,7 @@ void *ThreadFn(void *) {
   __hwasan_enable_allocator_tagging();
   // This will trigger memory deallocation in __strerror_thread_freeres,
   // at a point when HwasanThread is already gone.
+  return NULL;
 }
 
 int main() {
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/diag-stacktrace.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/diag-stacktrace.cpp
index 8b7cb6ade35ac..296171848255f 100644
--- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/diag-stacktrace.cpp
+++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/diag-stacktrace.cpp
@@ -2,10 +2,10 @@
 // UNSUPPORTED: target=thumb{{.*}}
 // UNSUPPORTED: android
 
-// RUN: %clangxx -fsanitize=return %gmlt -O2 -fno-omit-frame-pointer -fasynchronous-unwind-tables %s -o %t
+// RUN: %clangxx -Wno-error=return-type -fsanitize=return %gmlt -O2 -fno-omit-frame-pointer -fasynchronous-unwind-tables %s -o %t
 // RUN: %env_ubsan_opts=print_stacktrace=1:fast_unwind_on_fatal=0 not %run %t 2>&1 | FileCheck %s
 // RUN: %env_ubsan_opts=print_stacktrace=1:fast_unwind_on_fatal=1 not %run %t 2>&1 | FileCheck %s
-// RUN: %clangxx -fsanitize=return %gmlt -O2 -fno-omit-frame-pointer -fno-exceptions -fno-asynchronous-unwind-tables %s -o %t
+// RUN: %clangxx -Wno-error=return-type -fsanitize=return %gmlt -O2 -fno-omit-frame-pointer -fno-exceptions -fno-asynchronous-unwind-tables %s -o %t
 // RUN: %env_ubsan_opts=print_stacktrace=1:fast_unwind_on_fatal=0 not %run %t 2>&1 | FileCheck %s
 // RUN: %env_ubsan_opts=print_stacktrace=1:fast_unwind_on_fatal=1 not %run %t 2>&1 | FileCheck %s
 
diff --git a/compiler-rt/test/ubsan/TestCases/Misc/missing_return.cpp b/compiler-rt/test/ubsan/TestCases/Misc/missing_return.cpp
index 2ea76daf1fc16..6c1bc5525cd17 100644
--- a/compiler-rt/test/ubsan/TestCases/Misc/missing_return.cpp
+++ b/compiler-rt/test/ubsan/TestCases/Misc/missing_return.cpp
@@ -1,4 +1,4 @@
-// RUN: %clangxx -fsanitize=return %gmlt %s -O3 -o %t
+// RUN: %clangxx -Wno-error=return-type -fsanitize=return %gmlt %s -O3 -o %t
 // RUN: not %run %t 2>&1 | FileCheck %s
 // RUN: %env_ubsan_opts=print_stacktrace=1 not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-STACKTRACE
 // Error message does not exact what expected
diff --git a/flang/CMakeLists.txt b/flang/CMakeLists.txt
index ac8f784fd811e..4b703b456cae2 100644
--- a/flang/CMakeLists.txt
+++ b/flang/CMakeLists.txt
@@ -467,6 +467,17 @@ if (APPLE)
   endif()
 endif()
 
+# Set up job pools for flang.  Some of the flang sources take a lot of memory to
+# compile, so allow users to limit the number of parallel flang jobs.  This is
+# useful for building flang alongside several other projects since you can use
+# the maximum number of build jobs for the other projects while limiting the
+# number of flang compile jobs.
+set(FLANG_PARALLEL_COMPILE_JOBS CACHE STRING
+  "The maximum number of concurrent compilation jobs for Flang (Ninja only)")
+if (FLANG_PARALLEL_COMPILE_JOBS)
+  set_property(GLOBAL APPEND PROPERTY JOB_POOLS flang_compile_job_pool=${FLANG_PARALLEL_COMPILE_JOBS})
+endif()
+
 include(AddFlang)
 include(FlangCommon)
 
diff --git a/flang/cmake/modules/AddFlang.cmake b/flang/cmake/modules/AddFlang.cmake
index badbd4e7b964b..ca233103ccdbe 100644
--- a/flang/cmake/modules/AddFlang.cmake
+++ b/flang/cmake/modules/AddFlang.cmake
@@ -94,6 +94,9 @@ function(add_flang_library name)
       set_property(GLOBAL APPEND PROPERTY FLANG_LIBS ${name})
     endif()
     set_property(GLOBAL APPEND PROPERTY FLANG_EXPORTS ${name})
+    if (FLANG_PARALLEL_COMPILE_JOBS)
+      set_property(TARGET ${name} PROPERTY JOB_POOL_COMPILE flang_compile_job_pool)
+    endif()
   else()
     # Add empty "phony" target
     add_custom_target(${name})
diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 65732ce7f3224..caec6a913293f 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -335,6 +335,7 @@ struct IntrinsicLibrary {
   mlir::Value genMalloc(mlir::Type, llvm::ArrayRef<mlir::Value>);
   template <typename Shift>
   mlir::Value genMask(mlir::Type, llvm::ArrayRef<mlir::Value>);
+  mlir::Value genMatchAllSync(mlir::Type, llvm::ArrayRef<mlir::Value>);
   fir::ExtendedValue genMatmul(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genMatmulTranspose(mlir::Type,
                                         llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h
index e82446a2ba884..56dcfa88ad92d 100644
--- a/flang/include/flang/Semantics/tools.h
+++ b/flang/include/flang/Semantics/tools.h
@@ -231,6 +231,7 @@ inline bool NeedCUDAAlloc(const Symbol &sym) {
         (*details->cudaDataAttr() == common::CUDADataAttr::Device ||
             *details->cudaDataAttr() == common::CUDADataAttr::Managed ||
             *details->cudaDataAttr() == common::CUDADataAttr::Unified ||
+            *details->cudaDataAttr() == common::CUDADataAttr::Shared ||
             *details->cudaDataAttr() == common::CUDADataAttr::Pinned)) {
       return true;
     }
diff --git a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
index 70fa18ad65b9b..436f7a1154c7c 100644
--- a/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
+++ b/flang/lib/Optimizer/Analysis/AliasAnalysis.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "flang/Optimizer/Analysis/AliasAnalysis.h"
+#include "flang/Optimizer/CodeGen/CGOps.h"
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
@@ -61,13 +62,17 @@ getOriginalDef(mlir::Value v,
     mlir::Type ty = defOp->getResultTypes()[0];
     llvm::TypeSwitch<Operation *>(defOp)
         .Case<fir::ConvertOp>([&](fir::ConvertOp op) { v = op.getValue(); })
-        .Case<fir::DeclareOp, hlfir::DeclareOp>([&](auto op) {
-          v = op.getMemref();
-          auto varIf = llvm::cast<fir::FortranVariableOpInterface>(defOp);
-          attributes |= getAttrsFromVariable(varIf);
-          isCapturedInInternalProcedure |=
-              varIf.isCapturedInInternalProcedure();
-        })
+        .Case<fir::DeclareOp, hlfir::DeclareOp, fir::cg::XDeclareOp>(
+            [&](auto op) {
+              v = op.getMemref();
+              auto varIf =
+                  llvm::dyn_cast<fir::FortranVariableOpInterface>(defOp);
+              if (varIf) {
+                attributes |= getAttrsFromVariable(varIf);
+                isCapturedInInternalProcedure |=
+                    varIf.isCapturedInInternalProcedure();
+              }
+            })
         .Case<fir::CoordinateOp>([&](auto op) {
           if (fir::AliasAnalysis::isPointerReference(ty))
             attributes.set(fir::AliasAnalysis::Attribute::Pointer);
@@ -591,19 +596,21 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
             followBoxData = true;
           approximateSource = true;
         })
-        .Case<fir::EmboxOp, fir::ReboxOp>([&](auto op) {
-          if (followBoxData) {
-            v = op->getOperand(0);
-            defOp = v.getDefiningOp();
-          } else
-            breakFromLoop = true;
-        })
+        .Case<fir::EmboxOp, fir::ReboxOp, fir::cg::XEmboxOp, fir::cg::XReboxOp>(
+            [&](auto op) {
+              if (followBoxData) {
+                v = op->getOperand(0);
+                defOp = v.getDefiningOp();
+              } else
+                breakFromLoop = true;
+            })
         .Case<fir::LoadOp>([&](auto op) {
           // If load is inside target and it points to mapped item,
           // continue tracking.
           Operation *loadMemrefOp = op.getMemref().getDefiningOp();
           bool isDeclareOp =
               llvm::isa_and_present<fir::DeclareOp>(loadMemrefOp) ||
+              llvm::isa_and_present<fir::cg::XDeclareOp>(loadMemrefOp) ||
               llvm::isa_and_present<hlfir::DeclareOp>(loadMemrefOp);
           if (isDeclareOp &&
               llvm::isa<omp::TargetOp>(loadMemrefOp->getParentOp())) {
@@ -666,7 +673,8 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
           global = llvm::cast<fir::AddrOfOp>(op).getSymbol();
           breakFromLoop = true;
         })
-        .Case<hlfir::DeclareOp, fir::DeclareOp>([&](auto op) {
+        .Case<hlfir::DeclareOp, fir::DeclareOp,
+              fir::cg::XDeclareOp>([&](auto op) {
           bool isPrivateItem = false;
           if (omp::BlockArgOpenMPOpInterface argIface =
                   dyn_cast<omp::BlockArgOpenMPOpInterface>(op->getParentOp())) {
@@ -700,30 +708,33 @@ AliasAnalysis::Source AliasAnalysis::getSource(mlir::Value v,
               return;
             }
           }
-          auto varIf = llvm::cast<fir::FortranVariableOpInterface>(defOp);
-          // While going through a declare operation collect
-          // the variable attributes from it. Right now, some
-          // of the attributes are duplicated, e.g. a TARGET dummy
-          // argument has the target attribute both on its declare
-          // operation and on the entry block argument.
-          // In case of host associated use, the declare operation
-          // is the only carrier of the variable attributes,
-          // so we have to collect them here.
-          attributes |= getAttrsFromVariable(varIf);
-          isCapturedInInternalProcedure |=
-              varIf.isCapturedInInternalProcedure();
-          if (varIf.isHostAssoc()) {
-            // Do not track past such DeclareOp, because it does not
-            // currently provide any useful information. The host associated
-            // access will end up dereferencing the host association tuple,
-            // so we may as well stop right now.
-            v = defOp->getResult(0);
-            // TODO: if the host associated variable is a dummy argument
-            // of the host, I think, we can treat it as SourceKind::Argument
-            // for the purpose of alias analysis inside the internal procedure.
-            type = SourceKind::HostAssoc;
-            breakFromLoop = true;
-            return;
+          auto varIf = llvm::dyn_cast<fir::FortranVariableOpInterface>(defOp);
+          if (varIf) {
+            // While going through a declare operation collect
+            // the variable attributes from it. Right now, some
+            // of the attributes are duplicated, e.g. a TARGET dummy
+            // argument has the target attribute both on its declare
+            // operation and on the entry block argument.
+            // In case of host associated use, the declare operation
+            // is the only carrier of the variable attributes,
+            // so we have to collect them here.
+            attributes |= getAttrsFromVariable(varIf);
+            isCapturedInInternalProcedure |=
+                varIf.isCapturedInInternalProcedure();
+            if (varIf.isHostAssoc()) {
+              // Do not track past such DeclareOp, because it does not
+              // currently provide any useful information. The host associated
+              // access will end up dereferencing the host association tuple,
+              // so we may as well stop right now.
+              v = defOp->getResult(0);
+              // TODO: if the host associated variable is a dummy argument
+              // of the host, I think, we can treat it as SourceKind::Argument
+              // for the purpose of alias analysis inside the internal
+              // procedure.
+              type = SourceKind::HostAssoc;
+              breakFromLoop = true;
+              return;
+            }
           }
           if (getLastInstantiationPoint) {
             // Fetch only the innermost instantiation point.
diff --git a/flang/lib/Optimizer/Analysis/CMakeLists.txt b/flang/lib/Optimizer/Analysis/CMakeLists.txt
index 4d4ad882c27d3..3249f8a76ae3e 100644
--- a/flang/lib/Optimizer/Analysis/CMakeLists.txt
+++ b/flang/lib/Optimizer/Analysis/CMakeLists.txt
@@ -6,12 +6,14 @@ add_flang_library(FIRAnalysis
   FIRDialect
   FIRSupport
   HLFIRDialect
+  FIRCodeGen
 
   LINK_LIBS
   FIRBuilder
   FIRDialect
   FIRSupport
   HLFIRDialect
+  FIRCodeGen
 
   MLIR_DEPS
   MLIRIR
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 93744fa58ebc0..754496921ca3a 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -469,6 +469,22 @@ static constexpr IntrinsicHandler handlers[]{
     {"malloc", &I::genMalloc},
     {"maskl", &I::genMask<mlir::arith::ShLIOp>},
     {"maskr", &I::genMask<mlir::arith::ShRUIOp>},
+    {"match_all_syncjd",
+     &I::genMatchAllSync,
+     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+     /*isElemental=*/false},
+    {"match_all_syncjf",
+     &I::genMatchAllSync,
+     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+     /*isElemental=*/false},
+    {"match_all_syncjj",
+     &I::genMatchAllSync,
+     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+     /*isElemental=*/false},
+    {"match_all_syncjx",
+     &I::genMatchAllSync,
+     {{{"mask", asValue}, {"value", asValue}, {"pred", asAddr}}},
+     /*isElemental=*/false},
     {"matmul",
      &I::genMatmul,
      {{{"matrix_a", asAddr}, {"matrix_b", asAddr}}},
@@ -6044,6 +6060,42 @@ mlir::Value IntrinsicLibrary::genMask(mlir::Type resultType,
   return result;
 }
 
+mlir::Value
+IntrinsicLibrary::genMatchAllSync(mlir::Type resultType,
+                                  llvm::ArrayRef<mlir::Value> args) {
+  assert(args.size() == 3);
+  bool is32 = args[1].getType().isInteger(32) || args[1].getType().isF32();
+
+  llvm::StringRef funcName =
+      is32 ? "llvm.nvvm.match.all.sync.i32p" : "llvm.nvvm.match.all.sync.i64p";
+  mlir::MLIRContext *context = builder.getContext();
+  mlir::Type i32Ty = builder.getI32Type();
+  mlir::Type i64Ty = builder.getI64Type();
+  mlir::Type i1Ty = builder.getI1Type();
+  mlir::Type retTy = mlir::TupleType::get(context, {resultType, i1Ty});
+  mlir::Type valTy = is32 ? i32Ty : i64Ty;
+
+  mlir::FunctionType ftype =
+      mlir::FunctionType::get(context, {i32Ty, valTy}, {retTy});
+  auto funcOp = builder.createFunction(loc, funcName, ftype);
+  llvm::SmallVector<mlir::Value> filteredArgs;
+  filteredArgs.push_back(args[0]);
+  if (args[1].getType().isF32() || args[1].getType().isF64())
+    filteredArgs.push_back(builder.create<fir::ConvertOp>(loc, valTy, args[1]));
+  else
+    filteredArgs.push_back(args[1]);
+  auto call = builder.create<fir::CallOp>(loc, funcOp, filteredArgs);
+  auto zero = builder.getIntegerAttr(builder.getIndexType(), 0);
+  auto value = builder.create<fir::ExtractValueOp>(
+      loc, resultType, call.getResult(0), builder.getArrayAttr(zero));
+  auto one = builder.getIntegerAttr(builder.getIndexType(), 1);
+  auto pred = builder.create<fir::ExtractValueOp>(loc, i1Ty, call.getResult(0),
+                                                  builder.getArrayAttr(one));
+  auto conv = builder.create<mlir::LLVM::ZExtOp>(loc, resultType, pred);
+  builder.create<fir::StoreOp>(loc, conv, args[2]);
+  return value;
+}
+
 // MATMUL
 fir::ExtendedValue
 IntrinsicLibrary::genMatmul(mlir::Type resultType,
diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
index c76b7cde55bdd..439cc7a856236 100644
--- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp
+++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp
@@ -292,6 +292,12 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
       rewriter.setInsertionPointAfter(size.getDefiningOp());
     }
 
+    if (auto dataAttr = alloc->getAttrOfType<cuf::DataAttributeAttr>(
+            cuf::getDataAttrName())) {
+      if (dataAttr.getValue() == cuf::DataAttribute::Shared)
+        allocaAs = 3;
+    }
+
     // NOTE: we used to pass alloc->getAttrs() in the builder for non opaque
     // pointers! Only propagate pinned and bindc_name to help debugging, but
     // this should have no functional purpose (and passing the operand segment
@@ -316,6 +322,7 @@ struct AllocaOpConversion : public fir::FIROpConversion<fir::AllocaOp> {
       rewriter.replaceOpWithNewOp<mlir::LLVM::AddrSpaceCastOp>(
           alloc, ::getLlvmPtrType(alloc.getContext(), programAs), llvmAlloc);
     }
+
     return mlir::success();
   }
 };
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index b05991a29a321..fa82f3916a57e 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -57,7 +57,8 @@ static llvm::LogicalResult checkCudaAttr(Op op) {
   if (op.getDataAttr() == cuf::DataAttribute::Device ||
       op.getDataAttr() == cuf::DataAttribute::Managed ||
       op.getDataAttr() == cuf::DataAttribute::Unified ||
-      op.getDataAttr() == cuf::DataAttribute::Pinned)
+      op.getDataAttr() == cuf::DataAttribute::Pinned ||
+      op.getDataAttr() == cuf::DataAttribute::Shared)
     return mlir::success();
   return op.emitOpError()
          << "expect device, managed, pinned or unified cuda attribute";
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index e473590a7d78f..c75c5c191ab51 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -562,4 +562,31 @@ attributes(device) integer(8) function clock64()
     end function
   end interface
 
+interface match_all_sync
+  attributes(device) integer function match_all_syncjj(mask, val, pred)
+!dir$ ignore_tkr(d) mask, (d) val, (d) pred
+  integer(4), value :: mask
+  integer(4), value :: val
+  integer(4)        :: pred
+  end function
+  attributes(device) integer function match_all_syncjx(mask, val, pred)
+!dir$ ignore_tkr(d) mask, (d) val, (d) pred
+  integer(4), value :: mask
+  integer(8), value :: val
+  integer(4)        :: pred
+  end function
+  attributes(device) integer function match_all_syncjf(mask, val, pred)
+!dir$ ignore_tkr(d) mask, (d) val, (d) pred
+  integer(4), value :: mask
+  real(4), value    :: val
+  integer(4)        :: pred
+  end function
+  attributes(device) integer function match_all_syncjd(mask, val, pred)
+!dir$ ignore_tkr(d) mask, (d) val, (d) pred
+  integer(4), value :: mask
+  real(8), value    :: val
+  integer(4)        :: pred
+  end function
+end interface
+
 end module
diff --git a/flang/test/Analysis/AliasAnalysis/fircg-as-sources.fir b/flang/test/Analysis/AliasAnalysis/fircg-as-sources.fir
new file mode 100644
index 0000000000000..edb3b1dadb8cd
--- /dev/null
+++ b/flang/test/Analysis/AliasAnalysis/fircg-as-sources.fir
@@ -0,0 +1,108 @@
+// Check aliasing with the address *in* (not *of*) a local (fir.alloca) pointer
+// variable.
+//
+// Throughout this test, the ".fir" suffix on symbols indicates a version of the
+// MLIR after convert-hlfir-to-fir.  We would like alias analysis results to be
+// the same in both versions.
+
+// RUN: fir-opt %s -split-input-file -o /dev/null --mlir-disable-threading  \
+// RUN:   -pass-pipeline='builtin.module(func.func(test-fir-alias-analysis))' \
+// RUN:   2>&1 | FileCheck -match-full-lines %s
+
+// subroutine test(p1, arr, t_arr, alloc, t_alloc, t, v)
+//   real, pointer :: p1
+//   real :: arr(:)
+//   real, target :: t_arr(:)
+//   real, allocatable :: alloc
+//   real, allocatable, target :: t_alloc
+//   real, target :: t
+//   real :: v
+//   real, pointer :: p0
+// end subroutine test
+
+// check when fircg.ext_rebox and fircg.ext_declare are in the path of tracing the source
+// CHECK-LABEL: Testing : "_QPtest.fir"
+// CHECK-DAG: p0.tgt.fir#0 <-> arr(1).fir#0: NoAlias
+// CHECK-DAG: p0.tgt.fir#0 <-> t_arr(1).fir#0: MayAlias
+// CHECK-DAG: p0.tgt.fir#0 <-> alloc.tgt.fir#0: NoAlias
+// CHECK-DAG: p0.tgt.fir#0 <-> t_alloc.tgt.fir#0: MayAlias
+// CHECK-DAG: alloc.fir#0 <-> alloc.tgt.fir#0: NoAlias
+
+func.func @_QPtest.fir(%arg0: !fir.ref<!fir.box<!fir.ptr<f32>>> {fir.bindc_name = "p1"}, %arg1: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "arr"}, %arg2: !fir.box<!fir.array<?xf32>> {fir.bindc_name = "t_arr", fir.target}, %arg3: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "alloc"}, %arg4: !fir.ref<!fir.box<!fir.heap<f32>>> {fir.bindc_name = "t_alloc", fir.target}, %arg5: !fir.ref<f32> {fir.bindc_name = "t", fir.target}, %arg6: !fir.ref<f32> {fir.bindc_name = "v"}) {
+  %0 = fir.dummy_scope : !fir.dscope
+  %1 = fircg.ext_declare %arg3 dummy_scope %0 {test.ptr = "alloc.fir", fortran_attrs = #fir.var_attrs<allocatable>, uniq_name = "_QFtestEalloc"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+  %2 = fir.declare %arg1 dummy_scope %0 {uniq_name = "_QFtestEarr"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+  %3 = fircg.ext_rebox %2 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  %4 = fir.alloca !fir.box<!fir.ptr<f32>> {bindc_name = "p0", uniq_name = "_QFtestEp0"}
+  %5 = fircg.ext_declare %4 {test.ptr = "p0.fir", fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtestEp0"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>) -> !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %6 = fir.declare %arg0 dummy_scope %0 {test.ptr = "p1.fir", fortran_attrs = #fir.var_attrs<pointer>, uniq_name = "_QFtestEp1"} : (!fir.ref<!fir.box<!fir.ptr<f32>>>, !fir.dscope) -> !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %7 = fir.declare %arg5 dummy_scope %0 {test.ptr = "t.fir", fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtestEt"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %8 = fir.declare %arg4 dummy_scope %0 {fortran_attrs = #fir.var_attrs<allocatable, target>, uniq_name = "_QFtestEt_alloc"} : (!fir.ref<!fir.box<!fir.heap<f32>>>, !fir.dscope) -> !fir.ref<!fir.box<!fir.heap<f32>>>
+  %9 = fir.declare %arg2 dummy_scope %0 {fortran_attrs = #fir.var_attrs<target>, uniq_name = "_QFtestEt_arr"} : (!fir.box<!fir.array<?xf32>>, !fir.dscope) -> !fir.box<!fir.array<?xf32>>
+  %10 = fircg.ext_rebox %9 : (!fir.box<!fir.array<?xf32>>) -> !fir.box<!fir.array<?xf32>>
+  %11 = fir.declare %arg6 dummy_scope %0 {test.ptr = "v.fir", uniq_name = "_QFtestEv"} : (!fir.ref<f32>, !fir.dscope) -> !fir.ref<f32>
+  %12 = fir.load %5 : !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %13 = fir.box_addr %12 {test.ptr = "p0.tgt.fir"} : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32>
+  %14 = fir.load %6 : !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %15 = fir.box_addr %14 {test.ptr = "p1.tgt.fir"} : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32>
+  %c1 = arith.constant 1 : index
+  %16 = fir.array_coor %3 %c1 {test.ptr="arr(1).fir"} : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+  %c1_0 = arith.constant 1 : index
+  %17 = fir.array_coor %10 %c1_0 {test.ptr="t_arr(1).fir"} : (!fir.box<!fir.array<?xf32>>, index) -> !fir.ref<f32>
+  %18 = fir.load %1 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  %19 = fir.box_addr %18 {test.ptr = "alloc.tgt.fir"} : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
+  %20 = fir.load %8 : !fir.ref<!fir.box<!fir.heap<f32>>>
+  %21 = fir.box_addr %20 {test.ptr = "t_alloc.tgt.fir"} : (!fir.box<!fir.heap<f32>>) -> !fir.heap<f32>
+  return
+}
+
+// -----
+// CHECK-LABEL: Testing : "_QFPtest3"
+
+// module pointers
+//   real, pointer :: p
+// end module
+//
+// program main
+//   use pointers
+//   real, target :: var1 = 1, var2 =2
+//   p => var1
+//
+//   call test3(p)
+//
+// contains
+//   subroutine test3(p1)
+//     real, pointer :: p1
+//     p1 => var2
+//     print *, p
+//   end subroutine
+// end
+
+// check when there are fircg.ext_embox in the paths
+// CHECK-DAG: p#0 <-> box.addr#0: NoAlias
+// CHECK-DAG: box.addr#0 <-> func.region0#0: NoAlias
+// CHECK-DAG: var2#0 <-> p#0: NoAlias
+// CHECK-DAG: var2#0 <-> box.addr#0: MustAlias
+// CHECK-DAG: var2#0 <-> func.region0#1: NoAlias
+// CHECK-DAG: box.addr#0 <-> func.region0#1: NoAlias
+
+fir.global @_QMpointersEp : !fir.box<!fir.ptr<f32>> {
+  %0 = fir.zero_bits !fir.ptr<f32>
+  %1 = fircg.ext_embox %0 : (!fir.ptr<f32>) -> !fir.box<!fir.ptr<f32>>
+  fir.has_value %1 : !fir.box<!fir.ptr<f32>>
+}
+
+fir.global internal @_QFEvar2 target : f32 {
+  %cst = arith.constant 2.000000e+00 : f32
+  fir.has_value %cst : f32
+}
+
+func.func @_QFPtest3(%arg0: !fir.ref<!fir.box<!fir.ptr<f32>>> {fir.bindc_name = "p1"}, %arg1: !fir.ref<f32>) attributes {test.ptr = "func"} {
+  %3 = fir.load %arg0 {test.ptr = "arg0.load"}: !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %4 = fir.address_of(@_QFEvar2) {test.ptr = "var2"} : !fir.ref<f32>
+  %5 = fir.address_of(@_QMpointersEp) {test.ptr = "p"} : !fir.ref<!fir.box<!fir.ptr<f32>>>
+  %6 = fircg.ext_embox %4 : (!fir.ref<f32>) -> !fir.box<!fir.ptr<f32>>
+  %13 = fir.box_addr %6 {test.ptr = "box.addr"} : (!fir.box<!fir.ptr<f32>>) -> !fir.ptr<f32>
+  return
+}
+
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 6a5524102c0ea..1210dae8608c8 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -112,6 +112,25 @@ end
 ! CHECK: fir.call @llvm.nvvm.barrier0.popc(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 ! CHECK: fir.call @llvm.nvvm.barrier0.or(%c1{{.*}}) fastmath<contract> : (i32) -> i32
 
+attributes(device) subroutine testMatch()
+  integer :: a, ipred, mask, v32
+  integer(8) :: v64
+  real(4) :: r4
+  real(8) :: r8
+  a = match_all_sync(mask, v32, ipred)
+  a = match_all_sync(mask, v64, ipred)
+  a = match_all_sync(mask, r4, ipred)
+  a = match_all_sync(mask, r8, ipred)
+end subroutine
+
+! CHECK-LABEL: func.func @_QPtestmatch()
+! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
+! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p
+! CHECK: fir.convert %{{.*}} : (f32) -> i32
+! CHECK: fir.call @llvm.nvvm.match.all.sync.i32p
+! CHECK: fir.convert %{{.*}} : (f64) -> i64
+! CHECK: fir.call @llvm.nvvm.match.all.sync.i64p
+
 ! CHECK: func.func private @llvm.nvvm.barrier0()
 ! CHECK: func.func private @llvm.nvvm.bar.warp.sync(i32)
 ! CHECK: func.func private @llvm.nvvm.membar.gl()
@@ -120,3 +139,5 @@ end
 ! CHECK: func.func private @llvm.nvvm.barrier0.and(i32) -> i32
 ! CHECK: func.func private @llvm.nvvm.barrier0.popc(i32) -> i32
 ! CHECK: func.func private @llvm.nvvm.barrier0.or(i32) -> i32
+! CHECK: func.func private @llvm.nvvm.match.all.sync.i32p(i32, i32) -> tuple<i32, i1>
+! CHECK: func.func private @llvm.nvvm.match.all.sync.i64p(i32, i64) -> tuple<i32, i1>
diff --git a/libc/src/stdio/scanf_core/CMakeLists.txt b/libc/src/stdio/scanf_core/CMakeLists.txt
index a8935d464417c..35b8b3d318a9f 100644
--- a/libc/src/stdio/scanf_core/CMakeLists.txt
+++ b/libc/src/stdio/scanf_core/CMakeLists.txt
@@ -54,15 +54,26 @@ add_object_library(
     libc.src.__support.arg_list
 )
 
-add_object_library(
+if(LIBC_TARGET_OS_IS_GPU)
+add_header_library(
+  reader
+  HDRS
+    reader.h
+  DEPENDS
+    libc.src.__support.macros.attributes
+)
+elseif((TARGET libc.src.__support.File.file) OR (NOT LLVM_LIBC_FULL_BUILD))
+add_header_library(
   reader
-  SRCS
-    reader.cpp
   HDRS
     reader.h
   DEPENDS
     libc.src.__support.macros.attributes
+    libc.hdr.types.FILE
+    libc.src.__support.File.file
+  ${use_system_file}
 )
+endif()
 
 add_object_library(
   converter
diff --git a/libc/src/stdio/scanf_core/reader.cpp b/libc/src/stdio/scanf_core/reader.cpp
deleted file mode 100644
index ec1f5c098dc7a..0000000000000
--- a/libc/src/stdio/scanf_core/reader.cpp
+++ /dev/null
@@ -1,29 +0,0 @@
-//===-- Reader definition for scanf -----------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/stdio/scanf_core/reader.h"
-#include "src/__support/macros/config.h"
-#include <stddef.h>
-
-namespace LIBC_NAMESPACE_DECL {
-namespace scanf_core {
-
-void Reader::ungetc(char c) {
-  --cur_chars_read;
-  if (rb != nullptr && rb->buff_cur > 0) {
-    // While technically c should be written back to the buffer, in scanf we
-    // always write the character that was already there. Additionally, the
-    // buffer is most likely to contain a string that isn't part of a file,
-    // which may not be writable.
-    --(rb->buff_cur);
-    return;
-  }
-  stream_ungetc(static_cast<int>(c), input_stream);
-}
-} // namespace scanf_core
-} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/stdio/scanf_core/reader.h b/libc/src/stdio/scanf_core/reader.h
index f984fd9378910..1f8ec9695a314 100644
--- a/libc/src/stdio/scanf_core/reader.h
+++ b/libc/src/stdio/scanf_core/reader.h
@@ -9,15 +9,73 @@
 #ifndef LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H
 #define LLVM_LIBC_SRC_STDIO_SCANF_CORE_READER_H
 
+#include "hdr/types/FILE.h"
+
+#ifndef LIBC_COPT_STDIO_USE_SYSTEM_FILE
+#include "src/__support/File/file.h"
+#endif
+
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+#include "src/stdio/getc.h"
+#include "src/stdio/ungetc.h"
+#endif
+
 #include "src/__support/macros/attributes.h" // For LIBC_INLINE
 #include "src/__support/macros/config.h"
+
 #include <stddef.h>
 
 namespace LIBC_NAMESPACE_DECL {
 namespace scanf_core {
-
-using StreamGetc = int (*)(void *);
-using StreamUngetc = void (*)(int, void *);
+// We use the name "reader_internal" over "internal" because
+// "internal" causes name lookups in files that include the current header to be
+// ambigious i.e. `internal::foo` in those files, will try to lookup in
+// `LIBC_NAMESPACE::scanf_core::internal` over `LIBC_NAMESPACE::internal` for
+// e.g., `internal::ArgList` in `libc/src/stdio/scanf_core/scanf_main.h`
+namespace reader_internal {
+
+#if defined(LIBC_TARGET_ARCH_IS_GPU)
+// The GPU build provides FILE access through the host operating system's
+// library. So here we simply use the public entrypoints like in the SYSTEM_FILE
+// interface. Entrypoints should normally not call others, this is an exception.
+// FIXME: We do not acquire any locks here, so this is not thread safe.
+LIBC_INLINE int getc(void *f) {
+  return LIBC_NAMESPACE::getc(reinterpret_cast<::FILE *>(f));
+}
+
+LIBC_INLINE void ungetc(int c, void *f) {
+  LIBC_NAMESPACE::ungetc(c, reinterpret_cast<::FILE *>(f));
+}
+
+#elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)
+
+LIBC_INLINE int getc(void *f) {
+  unsigned char c;
+  auto result =
+      reinterpret_cast<LIBC_NAMESPACE::File *>(f)->read_unlocked(&c, 1);
+  size_t r = result.value;
+  if (result.has_error() || r != 1)
+    return '\0';
+
+  return c;
+}
+
+LIBC_INLINE void ungetc(int c, void *f) {
+  reinterpret_cast<LIBC_NAMESPACE::File *>(f)->ungetc_unlocked(c);
+}
+
+#else  // defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)
+
+// Since ungetc_unlocked isn't always available, we don't acquire the lock for
+// system files.
+LIBC_INLINE int getc(void *f) { return ::getc(reinterpret_cast<::FILE *>(f)); }
+
+LIBC_INLINE void ungetc(int c, void *f) {
+  ::ungetc(c, reinterpret_cast<::FILE *>(f));
+}
+#endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
+
+} // namespace reader_internal
 
 // This is intended to be either a raw string or a buffer syncronized with the
 // file's internal buffer.
@@ -29,24 +87,15 @@ struct ReadBuffer {
 
 class Reader {
   ReadBuffer *rb;
-
   void *input_stream = nullptr;
-
-  // TODO: Remove these unnecessary function pointers
-  StreamGetc stream_getc = nullptr;
-  StreamUngetc stream_ungetc = nullptr;
-
   size_t cur_chars_read = 0;
 
 public:
   // TODO: Set buff_len with a proper constant
   LIBC_INLINE Reader(ReadBuffer *string_buffer) : rb(string_buffer) {}
 
-  LIBC_INLINE Reader(void *stream, StreamGetc stream_getc_in,
-                     StreamUngetc stream_ungetc_in,
-                     ReadBuffer *stream_buffer = nullptr)
-      : rb(stream_buffer), input_stream(stream), stream_getc(stream_getc_in),
-        stream_ungetc(stream_ungetc_in) {}
+  LIBC_INLINE Reader(void *stream, ReadBuffer *stream_buffer = nullptr)
+      : rb(stream_buffer), input_stream(stream) {}
 
   // This returns the next character from the input and advances it by one
   // character. When it hits the end of the string or file it returns '\0' to
@@ -59,12 +108,23 @@ class Reader {
       return output;
     }
     // This should reset the buffer if applicable.
-    return static_cast<char>(stream_getc(input_stream));
+    return static_cast<char>(reader_internal::getc(input_stream));
   }
 
   // This moves the input back by one character, placing c into the buffer if
   // this is a file reader, else c is ignored.
-  void ungetc(char c);
+  LIBC_INLINE void ungetc(char c) {
+    --cur_chars_read;
+    if (rb != nullptr && rb->buff_cur > 0) {
+      // While technically c should be written back to the buffer, in scanf we
+      // always write the character that was already there. Additionally, the
+      // buffer is most likely to contain a string that isn't part of a file,
+      // which may not be writable.
+      --(rb->buff_cur);
+      return;
+    }
+    reader_internal::ungetc(static_cast<int>(c), input_stream);
+  }
 
   LIBC_INLINE size_t chars_read() { return cur_chars_read; }
 };
diff --git a/libc/src/stdio/scanf_core/vfscanf_internal.h b/libc/src/stdio/scanf_core/vfscanf_internal.h
index 67126431fcded..4e20fa3b93091 100644
--- a/libc/src/stdio/scanf_core/vfscanf_internal.h
+++ b/libc/src/stdio/scanf_core/vfscanf_internal.h
@@ -18,8 +18,6 @@
 
 #if defined(LIBC_TARGET_ARCH_IS_GPU)
 #include "src/stdio/ferror.h"
-#include "src/stdio/getc.h"
-#include "src/stdio/ungetc.h"
 #endif
 
 #include "hdr/types/FILE.h"
@@ -38,14 +36,6 @@ LIBC_INLINE void flockfile(::FILE *) { return; }
 
 LIBC_INLINE void funlockfile(::FILE *) { return; }
 
-LIBC_INLINE int getc(void *f) {
-  return LIBC_NAMESPACE::getc(reinterpret_cast<::FILE *>(f));
-}
-
-LIBC_INLINE void ungetc(int c, void *f) {
-  LIBC_NAMESPACE::ungetc(c, reinterpret_cast<::FILE *>(f));
-}
-
 LIBC_INLINE int ferror_unlocked(::FILE *f) { return LIBC_NAMESPACE::ferror(f); }
 
 #elif !defined(LIBC_COPT_STDIO_USE_SYSTEM_FILE)
@@ -58,21 +48,6 @@ LIBC_INLINE void funlockfile(FILE *f) {
   reinterpret_cast<LIBC_NAMESPACE::File *>(f)->unlock();
 }
 
-LIBC_INLINE int getc(void *f) {
-  unsigned char c;
-  auto result =
-      reinterpret_cast<LIBC_NAMESPACE::File *>(f)->read_unlocked(&c, 1);
-  size_t r = result.value;
-  if (result.has_error() || r != 1)
-    return '\0';
-
-  return c;
-}
-
-LIBC_INLINE void ungetc(int c, void *f) {
-  reinterpret_cast<LIBC_NAMESPACE::File *>(f)->ungetc_unlocked(c);
-}
-
 LIBC_INLINE int ferror_unlocked(FILE *f) {
   return reinterpret_cast<LIBC_NAMESPACE::File *>(f)->error_unlocked();
 }
@@ -85,12 +60,6 @@ LIBC_INLINE void flockfile(::FILE *) { return; }
 
 LIBC_INLINE void funlockfile(::FILE *) { return; }
 
-LIBC_INLINE int getc(void *f) { return ::getc(reinterpret_cast<::FILE *>(f)); }
-
-LIBC_INLINE void ungetc(int c, void *f) {
-  ::ungetc(c, reinterpret_cast<::FILE *>(f));
-}
-
 LIBC_INLINE int ferror_unlocked(::FILE *f) { return ::ferror(f); }
 
 #endif // LIBC_COPT_STDIO_USE_SYSTEM_FILE
@@ -103,7 +72,7 @@ LIBC_INLINE int vfscanf_internal(::FILE *__restrict stream,
                                  const char *__restrict format,
                                  internal::ArgList &args) {
   internal::flockfile(stream);
-  scanf_core::Reader reader(stream, &internal::getc, internal::ungetc);
+  scanf_core::Reader reader(stream);
   int retval = scanf_core::scanf_main(&reader, format, args);
   if (retval == 0 && internal::ferror_unlocked(stream))
     retval = EOF;
diff --git a/libcxx/src/experimental/tzdb.cpp b/libcxx/src/experimental/tzdb.cpp
index 1f18226636fd5..ac5c62bb81902 100644
--- a/libcxx/src/experimental/tzdb.cpp
+++ b/libcxx/src/experimental/tzdb.cpp
@@ -763,8 +763,9 @@ void __init_tzdb(tzdb& __tzdb, __tz::__rules_storage_type& __rules) {
   // - The file /etc/timezone. This text file contains the name of the time
   //   zone.
   //
-  // On Linux systems it seems /etc/timezone is deprecated and being phased
-  // out. This file is used when /etc/localtime does not exist, or when it exists but is not a symlink. For more information and links see
+  // On Linux systems it seems /etc/timezone is deprecated and being phased out.
+  // This file is used when /etc/localtime does not exist, or when it exists but
+  // is not a symlink. For more information and links see
   // https://github.com/llvm/llvm-project/issues/105634
 
   string __name = chrono::__current_zone_environment();
diff --git a/lldb/include/lldb/Core/Debugger.h b/lldb/include/lldb/Core/Debugger.h
index 9c8a9623fe689..6ebc6147800e1 100644
--- a/lldb/include/lldb/Core/Debugger.h
+++ b/lldb/include/lldb/Core/Debugger.h
@@ -156,9 +156,9 @@ class Debugger : public std::enable_shared_from_this<Debugger>,
 
   void RestoreInputTerminalState();
 
-  lldb::StreamSP GetAsyncOutputStream();
+  lldb::StreamUP GetAsyncOutputStream();
 
-  lldb::StreamSP GetAsyncErrorStream();
+  lldb::StreamUP GetAsyncErrorStream();
 
   CommandInterpreter &GetCommandInterpreter() {
     assert(m_command_interpreter_up.get());
diff --git a/lldb/include/lldb/lldb-forward.h b/lldb/include/lldb/lldb-forward.h
index cda55ef06e549..c664d1398f74d 100644
--- a/lldb/include/lldb/lldb-forward.h
+++ b/lldb/include/lldb/lldb-forward.h
@@ -432,6 +432,7 @@ typedef std::unique_ptr<lldb_private::StackFrameRecognizerManager>
     StackFrameRecognizerManagerUP;
 typedef std::shared_ptr<lldb_private::StopInfo> StopInfoSP;
 typedef std::shared_ptr<lldb_private::Stream> StreamSP;
+typedef std::unique_ptr<lldb_private::Stream> StreamUP;
 typedef std::shared_ptr<lldb_private::StreamFile> StreamFileSP;
 typedef std::shared_ptr<lldb_private::LockableStreamFile> LockableStreamFileSP;
 typedef std::shared_ptr<lldb_private::StringSummaryFormat>
diff --git a/lldb/source/Breakpoint/BreakpointOptions.cpp b/lldb/source/Breakpoint/BreakpointOptions.cpp
index 09abcf5e081d2..242b5b30168c5 100644
--- a/lldb/source/Breakpoint/BreakpointOptions.cpp
+++ b/lldb/source/Breakpoint/BreakpointOptions.cpp
@@ -620,10 +620,8 @@ bool BreakpointOptions::BreakpointOptionsCallbackFunction(
 
       // Rig up the results secondary output stream to the debugger's, so the
       // output will come out synchronously if the debugger is set up that way.
-      StreamSP output_stream(debugger.GetAsyncOutputStream());
-      StreamSP error_stream(debugger.GetAsyncErrorStream());
-      result.SetImmediateOutputStream(output_stream);
-      result.SetImmediateErrorStream(error_stream);
+      result.SetImmediateOutputStream(debugger.GetAsyncOutputStream());
+      result.SetImmediateErrorStream(debugger.GetAsyncErrorStream());
 
       CommandInterpreterRunOptions options;
       options.SetStopOnContinue(true);
diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp
index dd841cb5cb4cc..9510cf4d14467 100644
--- a/lldb/source/Commands/CommandObjectCommands.cpp
+++ b/lldb/source/Commands/CommandObjectCommands.cpp
@@ -815,10 +815,9 @@ a number follows 'f':"
         for (const std::string &line : lines) {
           Status error = AppendRegexSubstitution(line, check_only);
           if (error.Fail()) {
-            if (!GetDebugger().GetCommandInterpreter().GetBatchCommandMode()) {
-              StreamSP out_stream = GetDebugger().GetAsyncOutputStream();
-              out_stream->Printf("error: %s\n", error.AsCString());
-            }
+            if (!GetDebugger().GetCommandInterpreter().GetBatchCommandMode())
+              GetDebugger().GetAsyncOutputStream()->Printf("error: %s\n",
+                                                           error.AsCString());
           }
         }
       }
diff --git a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
index 507ef3fbe4759..32cb80b421fd6 100644
--- a/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
+++ b/lldb/source/Commands/CommandObjectWatchpointCommand.cpp
@@ -252,10 +252,8 @@ are no syntax errors may indicate that a function was declared but never called.
         // Rig up the results secondary output stream to the debugger's, so the
         // output will come out synchronously if the debugger is set up that
         // way.
-        StreamSP output_stream(debugger.GetAsyncOutputStream());
-        StreamSP error_stream(debugger.GetAsyncErrorStream());
-        result.SetImmediateOutputStream(output_stream);
-        result.SetImmediateErrorStream(error_stream);
+        result.SetImmediateOutputStream(debugger.GetAsyncOutputStream());
+        result.SetImmediateErrorStream(debugger.GetAsyncErrorStream());
 
         CommandInterpreterRunOptions options;
         options.SetStopOnContinue(true);
diff --git a/lldb/source/Core/Debugger.cpp b/lldb/source/Core/Debugger.cpp
index 242ef1c8a4596..585138535203d 100644
--- a/lldb/source/Core/Debugger.cpp
+++ b/lldb/source/Core/Debugger.cpp
@@ -257,7 +257,7 @@ Status Debugger::SetPropertyValue(const ExecutionContext *exe_ctx,
         std::list<Status> errors;
         StreamString feedback_stream;
         if (!target_sp->LoadScriptingResources(errors, feedback_stream)) {
-          lldb::StreamSP s = GetAsyncErrorStream();
+          lldb::StreamUP s = GetAsyncErrorStream();
           for (auto &error : errors)
             s->Printf("%s\n", error.AsCString());
           if (feedback_stream.GetSize())
@@ -1328,13 +1328,13 @@ bool Debugger::PopIOHandler(const IOHandlerSP &pop_reader_sp) {
   return true;
 }
 
-StreamSP Debugger::GetAsyncOutputStream() {
-  return std::make_shared<StreamAsynchronousIO>(*this,
+StreamUP Debugger::GetAsyncOutputStream() {
+  return std::make_unique<StreamAsynchronousIO>(*this,
                                                 StreamAsynchronousIO::STDOUT);
 }
 
-StreamSP Debugger::GetAsyncErrorStream() {
-  return std::make_shared<StreamAsynchronousIO>(*this,
+StreamUP Debugger::GetAsyncErrorStream() {
+  return std::make_unique<StreamAsynchronousIO>(*this,
                                                 StreamAsynchronousIO::STDERR);
 }
 
@@ -1577,8 +1577,7 @@ static void PrivateReportDiagnostic(Debugger &debugger, Severity severity,
     // diagnostic directly to the debugger's error stream.
     DiagnosticEventData event_data(severity, std::move(message),
                                    debugger_specific);
-    StreamSP stream = debugger.GetAsyncErrorStream();
-    event_data.Dump(stream.get());
+    event_data.Dump(debugger.GetAsyncErrorStream().get());
     return;
   }
   EventSP event_sp = std::make_shared<Event>(
@@ -1774,12 +1773,11 @@ void Debugger::HandleBreakpointEvent(const EventSP &event_sp) {
     if (num_new_locations > 0) {
       BreakpointSP breakpoint =
           Breakpoint::BreakpointEventData::GetBreakpointFromEvent(event_sp);
-      StreamSP output_sp(GetAsyncOutputStream());
-      if (output_sp) {
-        output_sp->Printf("%d location%s added to breakpoint %d\n",
+      if (StreamUP output_up = GetAsyncOutputStream()) {
+        output_up->Printf("%d location%s added to breakpoint %d\n",
                           num_new_locations, num_new_locations == 1 ? "" : "s",
                           breakpoint->GetID());
-        output_sp->Flush();
+        output_up->Flush();
       }
     }
   }
@@ -1823,8 +1821,8 @@ void Debugger::HandleProcessEvent(const EventSP &event_sp) {
           ? EventDataStructuredData::GetProcessFromEvent(event_sp.get())
           : Process::ProcessEventData::GetProcessFromEvent(event_sp.get());
 
-  StreamSP output_stream_sp = GetAsyncOutputStream();
-  StreamSP error_stream_sp = GetAsyncErrorStream();
+  StreamUP output_stream_up = GetAsyncOutputStream();
+  StreamUP error_stream_up = GetAsyncErrorStream();
   const bool gui_enabled = IsForwardingEvents();
 
   if (!gui_enabled) {
@@ -1849,7 +1847,7 @@ void Debugger::HandleProcessEvent(const EventSP &event_sp) {
     if (got_state_changed && !state_is_stopped) {
       // This is a public stop which we are going to announce to the user, so
       // we should force the most relevant frame selection here.
-      Process::HandleProcessStateChangedEvent(event_sp, output_stream_sp.get(),
+      Process::HandleProcessStateChangedEvent(event_sp, output_stream_up.get(),
                                               SelectMostRelevantFrame,
                                               pop_process_io_handler);
     }
@@ -1865,37 +1863,35 @@ void Debugger::HandleProcessEvent(const EventSP &event_sp) {
       if (plugin_sp) {
         auto structured_data_sp =
             EventDataStructuredData::GetObjectFromEvent(event_sp.get());
-        if (output_stream_sp) {
-          StreamString content_stream;
-          Status error =
-              plugin_sp->GetDescription(structured_data_sp, content_stream);
-          if (error.Success()) {
-            if (!content_stream.GetString().empty()) {
-              // Add newline.
-              content_stream.PutChar('\n');
-              content_stream.Flush();
-
-              // Print it.
-              output_stream_sp->PutCString(content_stream.GetString());
-            }
-          } else {
-            error_stream_sp->Format("Failed to print structured "
-                                    "data with plugin {0}: {1}",
-                                    plugin_sp->GetPluginName(), error);
+        StreamString content_stream;
+        Status error =
+            plugin_sp->GetDescription(structured_data_sp, content_stream);
+        if (error.Success()) {
+          if (!content_stream.GetString().empty()) {
+            // Add newline.
+            content_stream.PutChar('\n');
+            content_stream.Flush();
+
+            // Print it.
+            output_stream_up->PutCString(content_stream.GetString());
           }
+        } else {
+          error_stream_up->Format("Failed to print structured "
+                                  "data with plugin {0}: {1}",
+                                  plugin_sp->GetPluginName(), error);
         }
       }
     }
 
     // Now display any stopped state changes after any STDIO
     if (got_state_changed && state_is_stopped) {
-      Process::HandleProcessStateChangedEvent(event_sp, output_stream_sp.get(),
+      Process::HandleProcessStateChangedEvent(event_sp, output_stream_up.get(),
                                               SelectMostRelevantFrame,
                                               pop_process_io_handler);
     }
 
-    output_stream_sp->Flush();
-    error_stream_sp->Flush();
+    output_stream_up->Flush();
+    error_stream_up->Flush();
 
     if (pop_process_io_handler)
       process_sp->PopProcessIOHandler();
@@ -1995,22 +1991,18 @@ lldb::thread_result_t Debugger::DefaultEventHandler() {
               const char *data = static_cast<const char *>(
                   EventDataBytes::GetBytesFromEvent(event_sp.get()));
               if (data && data[0]) {
-                StreamSP error_sp(GetAsyncErrorStream());
-                if (error_sp) {
-                  error_sp->PutCString(data);
-                  error_sp->Flush();
-                }
+                StreamUP error_up = GetAsyncErrorStream();
+                error_up->PutCString(data);
+                error_up->Flush();
               }
             } else if (event_type & CommandInterpreter::
                                         eBroadcastBitAsynchronousOutputData) {
               const char *data = static_cast<const char *>(
                   EventDataBytes::GetBytesFromEvent(event_sp.get()));
               if (data && data[0]) {
-                StreamSP output_sp(GetAsyncOutputStream());
-                if (output_sp) {
-                  output_sp->PutCString(data);
-                  output_sp->Flush();
-                }
+                StreamUP output_up = GetAsyncOutputStream();
+                output_up->PutCString(data);
+                output_up->Flush();
               }
             }
           } else if (broadcaster == &m_broadcaster) {
@@ -2125,7 +2117,7 @@ void Debugger::HandleProgressEvent(const lldb::EventSP &event_sp) {
   if (!file_sp->GetIsInteractive() || !file_sp->GetIsTerminalWithColors())
     return;
 
-  StreamSP output = GetAsyncOutputStream();
+  StreamUP output = GetAsyncOutputStream();
 
   // Print over previous line, if any.
   output->Printf("\r");
@@ -2175,8 +2167,7 @@ void Debugger::HandleDiagnosticEvent(const lldb::EventSP &event_sp) {
   if (!data)
     return;
 
-  StreamSP stream = GetAsyncErrorStream();
-  data->Dump(stream.get());
+  data->Dump(GetAsyncErrorStream().get());
 }
 
 bool Debugger::HasIOHandlerThread() const {
diff --git a/lldb/source/Core/DynamicLoader.cpp b/lldb/source/Core/DynamicLoader.cpp
index 9c6ca1e5f910c..76c71d2a49a48 100644
--- a/lldb/source/Core/DynamicLoader.cpp
+++ b/lldb/source/Core/DynamicLoader.cpp
@@ -328,7 +328,7 @@ ModuleSP DynamicLoader::LoadBinaryWithUUIDAndAddress(
     }
   } else {
     if (force_symbol_search) {
-      lldb::StreamSP s = target.GetDebugger().GetAsyncErrorStream();
+      lldb::StreamUP s = target.GetDebugger().GetAsyncErrorStream();
       s->Printf("Unable to find file");
       if (!name.empty())
         s->Printf(" %s", name.str().c_str());
diff --git a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
index 1d4cda6c046b7..60724f3900ae7 100644
--- a/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
+++ b/lldb/source/Plugins/DynamicLoader/Darwin-Kernel/DynamicLoaderDarwinKernel.cpp
@@ -738,7 +738,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule(
   }
 
   if (IsKernel() && m_uuid.IsValid()) {
-    lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream();
+    lldb::StreamUP s = target.GetDebugger().GetAsyncOutputStream();
     s->Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str());
     s->Printf("Load Address: 0x%" PRIx64 "\n", m_load_address);
 
@@ -830,7 +830,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule(
       }
 
       if (IsKernel() && !m_module_sp) {
-        lldb::StreamSP s = target.GetDebugger().GetAsyncErrorStream();
+        lldb::StreamUP s = target.GetDebugger().GetAsyncErrorStream();
         s->Printf("WARNING: Unable to locate kernel binary on the debugger "
                   "system.\n");
         if (kernel_search_error.Fail() && kernel_search_error.AsCString("") &&
@@ -974,7 +974,7 @@ bool DynamicLoaderDarwinKernel::KextImageInfo::LoadImageUsingMemoryModule(
   bool is_loaded = IsLoaded();
 
   if (is_loaded && m_module_sp && IsKernel()) {
-    lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream();
+    lldb::StreamUP s = target.GetDebugger().GetAsyncOutputStream();
     ObjectFile *kernel_object_file = m_module_sp->GetObjectFile();
     if (kernel_object_file) {
       addr_t file_address =
diff --git a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp
index 3bf0a46de57af..a23ba3ad5c545 100644
--- a/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp
+++ b/lldb/source/Plugins/DynamicLoader/FreeBSD-Kernel/DynamicLoaderFreeBSDKernel.cpp
@@ -327,7 +327,7 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule(
   Target &target = process->GetTarget();
 
   if (IsKernel() && m_uuid.IsValid()) {
-    lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream();
+    lldb::StreamUP s = target.GetDebugger().GetAsyncOutputStream();
     s->Printf("Kernel UUID: %s\n", m_uuid.GetAsString().c_str());
     s->Printf("Load Address: 0x%" PRIx64 "\n", m_load_address);
   }
@@ -355,9 +355,9 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule(
       if (!m_module_sp)
         m_module_sp = target.GetOrCreateModule(module_spec, true);
       if (IsKernel() && !m_module_sp) {
-        lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream();
-        s->Printf("WARNING: Unable to locate kernel binary on the debugger "
-                  "system.\n");
+        target.GetDebugger().GetAsyncOutputStream()->Printf(
+            "WARNING: Unable to locate kernel binary on the debugger "
+            "system.\n");
       }
     }
 
@@ -464,7 +464,7 @@ bool DynamicLoaderFreeBSDKernel::KModImageInfo::LoadImageUsingMemoryModule(
   }
 
   if (IsLoaded() && m_module_sp && IsKernel()) {
-    lldb::StreamSP s = target.GetDebugger().GetAsyncOutputStream();
+    lldb::StreamUP s = target.GetDebugger().GetAsyncOutputStream();
     ObjectFile *kernel_object_file = m_module_sp->GetObjectFile();
     if (kernel_object_file) {
       addr_t file_address =
diff --git a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
index 9b2907c680996..406e1d45dc39a 100644
--- a/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
+++ b/lldb/source/Plugins/Process/MacOSX-Kernel/ProcessKDP.cpp
@@ -321,20 +321,10 @@ Status ProcessKDP::DoConnectRemote(llvm::StringRef remote_url) {
           SetID(1);
           GetThreadList();
           SetPrivateState(eStateStopped);
-          StreamSP async_strm_sp(target.GetDebugger().GetAsyncOutputStream());
-          if (async_strm_sp) {
-            const char *cstr;
-            if ((cstr = m_comm.GetKernelVersion()) != NULL) {
-              async_strm_sp->Printf("Version: %s\n", cstr);
-              async_strm_sp->Flush();
-            }
-            //                      if ((cstr = m_comm.GetImagePath ()) != NULL)
-            //                      {
-            //                          async_strm_sp->Printf ("Image Path:
-            //                          %s\n", cstr);
-            //                          async_strm_sp->Flush();
-            //                      }
-          }
+          const char *cstr;
+          if ((cstr = m_comm.GetKernelVersion()) != NULL)
+            target.GetDebugger().GetAsyncOutputStream()->Printf("Version: %s\n",
+                                                                cstr);
         } else {
           return Status::FromErrorString("KDP_REATTACH failed");
         }
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index f36595145e035..8a8c0f92fbbc2 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -5495,8 +5495,7 @@ class CommandObjectProcessGDBRemoteSpeedTest : public CommandObjectParsed {
       if (process) {
         StreamSP output_stream_sp = result.GetImmediateOutputStream();
         if (!output_stream_sp)
-          output_stream_sp =
-              StreamSP(m_interpreter.GetDebugger().GetAsyncOutputStream());
+          output_stream_sp = m_interpreter.GetDebugger().GetAsyncOutputStream();
         result.SetImmediateOutputStream(output_stream_sp);
 
         const uint32_t num_packets =
diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp
index 0041c8f2b2db2..6db582096155f 100644
--- a/lldb/source/Target/Process.cpp
+++ b/lldb/source/Target/Process.cpp
@@ -2743,10 +2743,9 @@ Status Process::LaunchPrivate(ProcessLaunchInfo &launch_info, StateType &state,
 
     // Now that we know the process type, update its signal responses from the
     // ones stored in the Target:
-    if (m_unix_signals_sp) {
-      StreamSP warning_strm = GetTarget().GetDebugger().GetAsyncErrorStream();
-      GetTarget().UpdateSignalsFromDummy(m_unix_signals_sp, warning_strm);
-    }
+    if (m_unix_signals_sp)
+      GetTarget().UpdateSignalsFromDummy(
+          m_unix_signals_sp, GetTarget().GetDebugger().GetAsyncErrorStream());
 
     DynamicLoader *dyld = GetDynamicLoader();
     if (dyld)
@@ -3131,10 +3130,9 @@ void Process::CompleteAttach() {
   }
   // Now that we know the process type, update its signal responses from the
   // ones stored in the Target:
-  if (m_unix_signals_sp) {
-    StreamSP warning_strm = GetTarget().GetDebugger().GetAsyncErrorStream();
-    GetTarget().UpdateSignalsFromDummy(m_unix_signals_sp, warning_strm);
-  }
+  if (m_unix_signals_sp)
+    GetTarget().UpdateSignalsFromDummy(
+        m_unix_signals_sp, GetTarget().GetDebugger().GetAsyncErrorStream());
 
   // We have completed the attach, now it is time to find the dynamic loader
   // plug-in
diff --git a/lldb/source/Target/StopInfo.cpp b/lldb/source/Target/StopInfo.cpp
index 092d78d87a2b1..1c9ecbfe70c3c 100644
--- a/lldb/source/Target/StopInfo.cpp
+++ b/lldb/source/Target/StopInfo.cpp
@@ -1016,11 +1016,9 @@ class StopInfoWatchpoint : public StopInfo {
           wp_sp->CaptureWatchedValue(exe_ctx);
 
           Debugger &debugger = exe_ctx.GetTargetRef().GetDebugger();
-          StreamSP output_sp = debugger.GetAsyncOutputStream();
-          if (wp_sp->DumpSnapshots(output_sp.get())) {
-            output_sp->EOL();
-            output_sp->Flush();
-          }
+          StreamUP output_up = debugger.GetAsyncOutputStream();
+          if (wp_sp->DumpSnapshots(output_up.get()))
+            output_up->EOL();
         }
 
       } else {
diff --git a/llvm/include/llvm/CodeGen/RDFRegisters.h b/llvm/include/llvm/CodeGen/RDFRegisters.h
index 174ee115a1501..cc30b977ae421 100644
--- a/llvm/include/llvm/CodeGen/RDFRegisters.h
+++ b/llvm/include/llvm/CodeGen/RDFRegisters.h
@@ -111,7 +111,7 @@ struct RegisterRef {
   }
 
   static constexpr bool isRegId(unsigned Id) {
-    return Register::isPhysicalRegister(Id);
+    return Register(Id).isPhysical();
   }
   static constexpr bool isUnitId(unsigned Id) {
     return Register(Id).isVirtual();
diff --git a/llvm/include/llvm/CodeGen/Register.h b/llvm/include/llvm/CodeGen/Register.h
index 03e462872d3c2..6c02ffef89363 100644
--- a/llvm/include/llvm/CodeGen/Register.h
+++ b/llvm/include/llvm/CodeGen/Register.h
@@ -48,12 +48,6 @@ class Register {
     return Register(FI + MCRegister::FirstStackSlot);
   }
 
-  /// Return true if the specified register number is in
-  /// the physical register namespace.
-  static constexpr bool isPhysicalRegister(unsigned Reg) {
-    return MCRegister::isPhysicalRegister(Reg);
-  }
-
   /// Convert a 0-based index to a virtual register number.
   /// This is the inverse operation of VirtReg2IndexFunctor below.
   static Register index2VirtReg(unsigned Index) {
@@ -67,7 +61,9 @@ class Register {
 
   /// Return true if the specified register number is in the physical register
   /// namespace.
-  constexpr bool isPhysical() const { return isPhysicalRegister(Reg); }
+  constexpr bool isPhysical() const {
+    return MCRegister::isPhysicalRegister(Reg);
+  }
 
   /// Convert a virtual register number to a 0-based index. The first virtual
   /// register in a function will get the index 0.
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h
index 9bdf940fc77b7..4385df518a111 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h
@@ -23,57 +23,54 @@
 
 namespace llvm::sandboxir {
 
+class LegalityResult;
+
+struct Action {
+  unsigned Idx = 0;
+  const LegalityResult *LegalityRes = nullptr;
+  SmallVector<Value *, 4> Bndl;
+  SmallVector<Value *> UserBndl;
+  unsigned Depth;
+  SmallVector<Action *> Operands;
+  Value *Vec = nullptr;
+  Action(const LegalityResult *LR, ArrayRef<Value *> B, ArrayRef<Value *> UB,
+         unsigned Depth)
+      : LegalityRes(LR), Bndl(B), UserBndl(UB), Depth(Depth) {}
+#ifndef NDEBUG
+  void print(raw_ostream &OS) const;
+  void dump() const;
+  friend raw_ostream &operator<<(raw_ostream &OS, const Action &A) {
+    A.print(OS);
+    return OS;
+  }
+#endif // NDEBUG
+};
+
 /// Maps the original instructions to the vectorized instrs and the reverse.
 /// For now an original instr can only map to a single vector.
 class InstrMaps {
   /// A map from the original values that got combined into vectors, to the
-  /// vector value(s).
-  DenseMap<Value *, Value *> OrigToVectorMap;
-  /// A map from the vector value to a map of the original value to its lane.
+  /// vectorization Action.
+  DenseMap<Value *, Action *> OrigToVectorMap;
+  /// A map from the vec Action to a map of the original value to its lane.
   /// Please note that for constant vectors, there may multiple original values
   /// with the same lane, as they may be coming from vectorizing different
   /// original values.
-  DenseMap<Value *, DenseMap<Value *, unsigned>> VectorToOrigLaneMap;
-  Context &Ctx;
+  DenseMap<Action *, DenseMap<Value *, unsigned>> VectorToOrigLaneMap;
   std::optional<Context::CallbackID> EraseInstrCB;
 
-private:
-  void notifyEraseInstr(Value *V) {
-    // We don't know if V is an original or a vector value.
-    auto It = OrigToVectorMap.find(V);
-    if (It != OrigToVectorMap.end()) {
-      // V is an original value.
-      // Remove it from VectorToOrigLaneMap.
-      Value *Vec = It->second;
-      VectorToOrigLaneMap[Vec].erase(V);
-      // Now erase V from OrigToVectorMap.
-      OrigToVectorMap.erase(It);
-    } else {
-      // V is a vector value.
-      // Go over the original values it came from and remove them from
-      // OrigToVectorMap.
-      for (auto [Orig, Lane] : VectorToOrigLaneMap[V])
-        OrigToVectorMap.erase(Orig);
-      // Now erase V from VectorToOrigLaneMap.
-      VectorToOrigLaneMap.erase(V);
-    }
-  }
-
 public:
-  InstrMaps(Context &Ctx) : Ctx(Ctx) {
-    EraseInstrCB = Ctx.registerEraseInstrCallback(
-        [this](Instruction *I) { notifyEraseInstr(I); });
-  }
-  ~InstrMaps() { Ctx.unregisterEraseInstrCallback(*EraseInstrCB); }
+  InstrMaps() = default;
+  ~InstrMaps() = default;
   /// \Returns the vector value that we got from vectorizing \p Orig, or
   /// nullptr if not found.
-  Value *getVectorForOrig(Value *Orig) const {
+  Action *getVectorForOrig(Value *Orig) const {
     auto It = OrigToVectorMap.find(Orig);
     return It != OrigToVectorMap.end() ? It->second : nullptr;
   }
   /// \Returns the lane of \p Orig before it got vectorized into \p Vec, or
   /// nullopt if not found.
-  std::optional<unsigned> getOrigLane(Value *Vec, Value *Orig) const {
+  std::optional<unsigned> getOrigLane(Action *Vec, Value *Orig) const {
     auto It1 = VectorToOrigLaneMap.find(Vec);
     if (It1 == VectorToOrigLaneMap.end())
       return std::nullopt;
@@ -84,7 +81,7 @@ class InstrMaps {
     return It2->second;
   }
   /// Update the map to reflect that \p Origs got vectorized into \p Vec.
-  void registerVector(ArrayRef<Value *> Origs, Value *Vec) {
+  void registerVector(ArrayRef<Value *> Origs, Action *Vec) {
     auto &OrigToLaneMap = VectorToOrigLaneMap[Vec];
     unsigned Lane = 0;
     for (Value *Orig : Origs) {
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
index 132b12a7b4e6c..bc2942f87adcf 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h
@@ -17,6 +17,7 @@
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h"
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h"
 
 namespace llvm::sandboxir {
@@ -206,22 +207,22 @@ class Widen final : public LegalityResult {
 
 class DiamondReuse final : public LegalityResult {
   friend class LegalityAnalysis;
-  Value *Vec;
-  DiamondReuse(Value *Vec)
+  Action *Vec;
+  DiamondReuse(Action *Vec)
       : LegalityResult(LegalityResultID::DiamondReuse), Vec(Vec) {}
 
 public:
   static bool classof(const LegalityResult *From) {
     return From->getSubclassID() == LegalityResultID::DiamondReuse;
   }
-  Value *getVector() const { return Vec; }
+  Action *getVector() const { return Vec; }
 };
 
 class DiamondReuseWithShuffle final : public LegalityResult {
   friend class LegalityAnalysis;
-  Value *Vec;
+  Action *Vec;
   ShuffleMask Mask;
-  DiamondReuseWithShuffle(Value *Vec, const ShuffleMask &Mask)
+  DiamondReuseWithShuffle(Action *Vec, const ShuffleMask &Mask)
       : LegalityResult(LegalityResultID::DiamondReuseWithShuffle), Vec(Vec),
         Mask(Mask) {}
 
@@ -229,7 +230,7 @@ class DiamondReuseWithShuffle final : public LegalityResult {
   static bool classof(const LegalityResult *From) {
     return From->getSubclassID() == LegalityResultID::DiamondReuseWithShuffle;
   }
-  Value *getVector() const { return Vec; }
+  Action *getVector() const { return Vec; }
   const ShuffleMask &getMask() const { return Mask; }
 };
 
@@ -250,18 +251,18 @@ class CollectDescr {
   /// Describes how to get a value element. If the value is a vector then it
   /// also provides the index to extract it from.
   class ExtractElementDescr {
-    Value *V;
+    PointerUnion<Action *, Value *> V = nullptr;
     /// The index in `V` that the value can be extracted from.
-    /// This is nullopt if we need to use `V` as a whole.
-    std::optional<int> ExtractIdx;
+    int ExtractIdx = 0;
 
   public:
-    ExtractElementDescr(Value *V, int ExtractIdx)
+    ExtractElementDescr(Action *V, int ExtractIdx)
         : V(V), ExtractIdx(ExtractIdx) {}
-    ExtractElementDescr(Value *V) : V(V), ExtractIdx(std::nullopt) {}
-    Value *getValue() const { return V; }
-    bool needsExtract() const { return ExtractIdx.has_value(); }
-    int getExtractIdx() const { return *ExtractIdx; }
+    ExtractElementDescr(Value *V) : V(V) {}
+    Action *getValue() const { return cast<Action *>(V); }
+    Value *getScalar() const { return cast<Value *>(V); }
+    bool needsExtract() const { return isa<Action *>(V); }
+    int getExtractIdx() const { return ExtractIdx; }
   };
 
   using DescrVecT = SmallVector<ExtractElementDescr, 4>;
@@ -272,11 +273,11 @@ class CollectDescr {
       : Descrs(std::move(Descrs)) {}
   /// If all elements come from a single vector input, then return that vector
   /// and also the shuffle mask required to get them in order.
-  std::optional<std::pair<Value *, ShuffleMask>> getSingleInput() const {
+  std::optional<std::pair<Action *, ShuffleMask>> getSingleInput() const {
     const auto &Descr0 = *Descrs.begin();
-    Value *V0 = Descr0.getValue();
     if (!Descr0.needsExtract())
       return std::nullopt;
+    auto *V0 = Descr0.getValue();
     ShuffleMask::IndicesVecT MaskIndices;
     MaskIndices.push_back(Descr0.getExtractIdx());
     for (const auto &Descr : drop_begin(Descrs)) {
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
index daf6499213d48..b28e9948d6f55 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h
@@ -58,9 +58,33 @@ class BottomUpVec final : public RegionPass {
   /// function helps collect these instructions (along with the pointer operands
   /// for loads/stores) so that they can be cleaned up later.
   void collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl);
-  /// Recursively try to vectorize \p Bndl and its operands.
-  Value *vectorizeRec(ArrayRef<Value *> Bndl, ArrayRef<Value *> UserBndl,
-                      unsigned Depth);
+
+  /// Helper class describing how(if) to vectorize the code.
+  class ActionsVector {
+  private:
+    SmallVector<std::unique_ptr<Action>, 16> Actions;
+
+  public:
+    auto begin() const { return Actions.begin(); }
+    auto end() const { return Actions.end(); }
+    void push_back(std::unique_ptr<Action> &&ActPtr) {
+      ActPtr->Idx = Actions.size();
+      Actions.push_back(std::move(ActPtr));
+    }
+    void clear() { Actions.clear(); }
+#ifndef NDEBUG
+    void print(raw_ostream &OS) const;
+    void dump() const;
+#endif // NDEBUG
+  };
+  ActionsVector Actions;
+  /// Recursively try to vectorize \p Bndl and its operands. This populates the
+  /// `Actions` vector.
+  Action *vectorizeRec(ArrayRef<Value *> Bndl, ArrayRef<Value *> UserBndl,
+                       unsigned Depth);
+  /// Generate vector instructions based on `Actions` and return the last vector
+  /// created.
+  Value *emitVectors();
   /// Entry point for vectorization starting from \p Seeds.
   bool tryVectorize(ArrayRef<Value *> Seeds);
 
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index cab70c5c01a45..a1d91de3bb788 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -677,7 +677,7 @@ class AccessAnalysis {
                             const DenseMap<Value *, const SCEV *> &Strides,
                             DenseMap<Value *, unsigned> &DepSetId,
                             Loop *TheLoop, unsigned &RunningDepId,
-                            unsigned ASId, bool ShouldCheckStride, bool Assume);
+                            unsigned ASId, bool Assume);
 
   /// Check whether we can check the pointers at runtime for
   /// non-intersection.
@@ -685,8 +685,9 @@ class AccessAnalysis {
   /// Returns true if we need no check or if we do and we can generate them
   /// (i.e. the pointers have computable bounds).
   bool canCheckPtrAtRT(RuntimePointerChecking &RtCheck, ScalarEvolution *SE,
-                       Loop *TheLoop, const DenseMap<Value *, const SCEV *> &Strides,
-                       Value *&UncomputablePtr, bool ShouldCheckWrap = false);
+                       Loop *TheLoop,
+                       const DenseMap<Value *, const SCEV *> &Strides,
+                       Value *&UncomputablePtr);
 
   /// Goes over all memory accesses, checks whether a RT check is needed
   /// and builds sets of dependent accesses.
@@ -1115,13 +1116,11 @@ findForkedPointer(PredicatedScalarEvolution &PSE,
   return {{replaceSymbolicStrideSCEV(PSE, StridesMap, Ptr), false}};
 }
 
-bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
-                                          MemAccessInfo Access, Type *AccessTy,
-                                          const DenseMap<Value *, const SCEV *> &StridesMap,
-                                          DenseMap<Value *, unsigned> &DepSetId,
-                                          Loop *TheLoop, unsigned &RunningDepId,
-                                          unsigned ASId, bool ShouldCheckWrap,
-                                          bool Assume) {
+bool AccessAnalysis::createCheckForAccess(
+    RuntimePointerChecking &RtCheck, MemAccessInfo Access, Type *AccessTy,
+    const DenseMap<Value *, const SCEV *> &StridesMap,
+    DenseMap<Value *, unsigned> &DepSetId, Loop *TheLoop,
+    unsigned &RunningDepId, unsigned ASId, bool Assume) {
   Value *Ptr = Access.getPointer();
 
   SmallVector<PointerIntPair<const SCEV *, 1, bool>> TranslatedPtrs =
@@ -1152,8 +1151,7 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
 
     // When we run after a failing dependency check we have to make sure
     // we don't have wrapping pointers.
-    if (ShouldCheckWrap &&
-        !isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
+    if (!isNoWrap(PSE, AR, TranslatedPtrs.size() == 1 ? Ptr : nullptr, AccessTy,
                   TheLoop, Assume)) {
       return false;
     }
@@ -1182,10 +1180,10 @@ bool AccessAnalysis::createCheckForAccess(RuntimePointerChecking &RtCheck,
   return true;
 }
 
-bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
-                                     ScalarEvolution *SE, Loop *TheLoop,
-                                     const DenseMap<Value *, const SCEV *> &StridesMap,
-                                     Value *&UncomputablePtr, bool ShouldCheckWrap) {
+bool AccessAnalysis::canCheckPtrAtRT(
+    RuntimePointerChecking &RtCheck, ScalarEvolution *SE, Loop *TheLoop,
+    const DenseMap<Value *, const SCEV *> &StridesMap,
+    Value *&UncomputablePtr) {
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
@@ -1245,7 +1243,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
       for (const auto &AccessTy : Accesses[Access]) {
         if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                   DepSetId, TheLoop, RunningDepId, ASId,
-                                  ShouldCheckWrap, false)) {
+                                  false)) {
           LLVM_DEBUG(dbgs() << "LAA: Can't find bounds for ptr:"
                             << *Access.getPointer() << '\n');
           Retries.emplace_back(Access, AccessTy);
@@ -1275,7 +1273,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
       for (const auto &[Access, AccessTy] : Retries) {
         if (!createCheckForAccess(RtCheck, Access, AccessTy, StridesMap,
                                   DepSetId, TheLoop, RunningDepId, ASId,
-                                  ShouldCheckWrap, /*Assume=*/true)) {
+                                  /*Assume=*/true)) {
           CanDoAliasSetRT = false;
           UncomputablePtr = Access.getPointer();
           break;
@@ -1431,8 +1429,8 @@ void AccessAnalysis::processMemAccesses() {
           typedef SmallVector<const Value *, 16> ValueVector;
           ValueVector TempObjects;
 
-          UnderlyingObjects[Ptr] = {};
           SmallVector<const Value *, 16> &UOs = UnderlyingObjects[Ptr];
+          UOs = {};
           ::getUnderlyingObjects(Ptr, UOs, LI);
           LLVM_DEBUG(dbgs()
                      << "Underlying objects for pointer " << *Ptr << "\n");
@@ -2643,9 +2641,8 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   Value *UncomputablePtr = nullptr;
-  bool CanDoRTIfNeeded =
-      Accesses.canCheckPtrAtRT(*PtrRtChecking, PSE->getSE(), TheLoop,
-                               SymbolicStrides, UncomputablePtr, false);
+  bool CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
+      *PtrRtChecking, PSE->getSE(), TheLoop, SymbolicStrides, UncomputablePtr);
   if (!CanDoRTIfNeeded) {
     const auto *I = dyn_cast_or_null<Instruction>(UncomputablePtr);
     recordAnalysis("CantIdentifyArrayBounds", I)
@@ -2676,7 +2673,7 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
       auto *SE = PSE->getSE();
       UncomputablePtr = nullptr;
       CanDoRTIfNeeded = Accesses.canCheckPtrAtRT(
-          *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr, true);
+          *PtrRtChecking, SE, TheLoop, SymbolicStrides, UncomputablePtr);
 
       // Check that we found the bounds for the pointer.
       if (!CanDoRTIfNeeded) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index d87649c4e6567..0f11423a84930 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -525,7 +525,7 @@ void llvm::calculateDbgEntityHistory(const MachineFunction *MF,
           // Don't consider SP to be clobbered by register masks.
           for (auto It : RegVars) {
             unsigned int Reg = It.first;
-            if (Reg != SP && Register::isPhysicalRegister(Reg) &&
+            if (Reg != SP && Register(Reg).isPhysical() &&
                 MO.clobbersPhysReg(Reg))
               RegsToClobber.push_back(Reg);
           }
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index ddf0275ddfe6a..cf3673058c8e7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -564,7 +564,7 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP,
         TFI->getDwarfFrameBase(*Asm->MF);
     switch (FrameBase.Kind) {
     case TargetFrameLowering::DwarfFrameBase::Register: {
-      if (Register::isPhysicalRegister(FrameBase.Location.Reg)) {
+      if (Register(FrameBase.Location.Reg).isPhysical()) {
         MachineLocation Location(FrameBase.Location.Reg);
         addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
       }
diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp
index 1a9bc694ed0fd..a7c8c3fc8a25a 100644
--- a/llvm/lib/CodeGen/LiveRangeCalc.cpp
+++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -216,7 +216,7 @@ bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
       report_fatal_error("Use not jointly dominated by defs.");
     }
 
-    if (Register::isPhysicalRegister(PhysReg)) {
+    if (Register(PhysReg).isPhysical()) {
       const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
       bool IsLiveIn = MBB->isLiveIn(PhysReg);
       for (MCRegAliasIterator Alias(PhysReg, TRI, false); !IsLiveIn && Alias.isValid(); ++Alias)
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 0da7535031a7d..1cc1b2cbd81b9 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -3966,8 +3966,7 @@ void GenericScheduler::reschedulePhysReg(SUnit *SU, bool isTop) {
   // Find already scheduled copies with a single physreg dependence and move
   // them just above the scheduled instruction.
   for (SDep &Dep : Deps) {
-    if (Dep.getKind() != SDep::Data ||
-        !Register::isPhysicalRegister(Dep.getReg()))
+    if (Dep.getKind() != SDep::Data || !Register(Dep.getReg()).isPhysical())
       continue;
     SUnit *DepSU = Dep.getSUnit();
     if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 14128dafbe4ee..2809056bfeba2 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -708,7 +708,7 @@ void RegAllocFastImpl::reloadAtBegin(MachineBasicBlock &MBB) {
 /// not used by a virtreg. Kill the physreg, marking it free. This may add
 /// implicit kills to MO->getParent() and invalidate MO.
 bool RegAllocFastImpl::usePhysReg(MachineInstr &MI, MCPhysReg Reg) {
-  assert(Register::isPhysicalRegister(Reg) && "expected physreg");
+  assert(Register(Reg).isPhysical() && "expected physreg");
   bool displacedAny = displacePhysReg(MI, Reg);
   setPhysRegState(Reg, regPreAssigned);
   markRegUsedInInstr(Reg);
@@ -1289,7 +1289,7 @@ void RegAllocFastImpl::dumpState() const {
     assert(VirtReg.isVirtual() && "Bad map key");
     MCPhysReg PhysReg = LR.PhysReg;
     if (PhysReg != 0) {
-      assert(Register::isPhysicalRegister(PhysReg) && "mapped to physreg");
+      assert(Register(PhysReg).isPhysical() && "mapped to physreg");
       for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
         assert(RegUnitStates[Unit] == VirtReg && "inverse map valid");
       }
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index fd4641ec6f124..288b9d9553b1d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -501,8 +501,8 @@ bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
             F.isClobberKind()) {
           // Check for def of register or earlyclobber register.
           for (; NumVals; --NumVals, ++i) {
-            unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
-            if (Register::isPhysicalRegister(Reg))
+            Register Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+            if (Reg.isPhysical())
               CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
           }
         } else
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 133ac6b1327dd..a76498fcab8f2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -10125,9 +10125,8 @@ void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call,
     auto DetectWriteToReservedRegister = [&]() {
       const MachineFunction &MF = DAG.getMachineFunction();
       const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
-      for (unsigned Reg : OpInfo.AssignedRegs.Regs) {
-        if (Register::isPhysicalRegister(Reg) &&
-            TRI.isInlineAsmReadOnlyReg(MF, Reg)) {
+      for (Register Reg : OpInfo.AssignedRegs.Regs) {
+        if (Reg.isPhysical() && TRI.isInlineAsmReadOnlyReg(MF, Reg)) {
           const char *RegName = TRI.getName(Reg);
           emitInlineAsmError(Call, "write to reserved register '" +
                                        Twine(RegName) + "'");
@@ -11389,7 +11388,7 @@ void SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V,
   assert((Op.getOpcode() != ISD::CopyFromReg ||
           cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
          "Copy from a reg to the same reg!");
-  assert(!Register::isPhysicalRegister(Reg) && "Is a physreg");
+  assert(!Register(Reg).isPhysical() && "Is a physreg");
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   // If this is an InlineAsm we have to match the registers required, not the
diff --git a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
index f3a9fb188f51d..5d7d6a1141ba0 100644
--- a/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/XCOFFEmitter.cpp
@@ -145,14 +145,16 @@ bool XCOFFWriter::initSectionHeaders(uint64_t &CurrentOffset) {
   uint64_t CurrentEndTDataAddr = 0;
   for (uint16_t I = 0, E = InitSections.size(); I < E; ++I) {
     // Assign indices for sections.
-    if (InitSections[I].SectionName.size() &&
-        !SectionIndexMap[InitSections[I].SectionName]) {
-      // The section index starts from 1.
-      SectionIndexMap[InitSections[I].SectionName] = I + 1;
-      if ((I + 1) > MaxSectionIndex) {
-        ErrHandler("exceeded the maximum permitted section index of " +
-                   Twine(MaxSectionIndex));
-        return false;
+    if (InitSections[I].SectionName.size()) {
+      int16_t &SectionIndex = SectionIndexMap[InitSections[I].SectionName];
+      if (!SectionIndex) {
+        // The section index starts from 1.
+        SectionIndex = I + 1;
+        if ((I + 1) > MaxSectionIndex) {
+          ErrHandler("exceeded the maximum permitted section index of " +
+                     Twine(MaxSectionIndex));
+          return false;
+        }
       }
     }
 
@@ -779,19 +781,19 @@ bool XCOFFWriter::writeSymbols() {
       W.write<uint32_t>(YamlSym.Value);
     }
     if (YamlSym.SectionName) {
-      if (!SectionIndexMap.count(*YamlSym.SectionName)) {
+      auto It = SectionIndexMap.find(*YamlSym.SectionName);
+      if (It == SectionIndexMap.end()) {
         ErrHandler("the SectionName " + *YamlSym.SectionName +
                    " specified in the symbol does not exist");
         return false;
       }
-      if (YamlSym.SectionIndex &&
-          SectionIndexMap[*YamlSym.SectionName] != *YamlSym.SectionIndex) {
+      if (YamlSym.SectionIndex && It->second != *YamlSym.SectionIndex) {
         ErrHandler("the SectionName " + *YamlSym.SectionName +
                    " and the SectionIndex (" + Twine(*YamlSym.SectionIndex) +
                    ") refer to different sections");
         return false;
       }
-      W.write<int16_t>(SectionIndexMap[*YamlSym.SectionName]);
+      W.write<int16_t>(It->second);
     } else {
       W.write<int16_t>(YamlSym.SectionIndex.value_or(0));
     }
diff --git a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
index 174438c1863dd..c636719d86ca0 100644
--- a/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PBQPRegAlloc.cpp
@@ -155,11 +155,11 @@ bool A57ChainingConstraint::addIntraChainConstraint(PBQPRAGraph &G, unsigned Rd,
 
   LiveIntervals &LIs = G.getMetadata().LIS;
 
-  if (Register::isPhysicalRegister(Rd) || Register::isPhysicalRegister(Ra)) {
-    LLVM_DEBUG(dbgs() << "Rd is a physical reg:"
-                      << Register::isPhysicalRegister(Rd) << '\n');
-    LLVM_DEBUG(dbgs() << "Ra is a physical reg:"
-                      << Register::isPhysicalRegister(Ra) << '\n');
+  if (Register(Rd).isPhysical() || Register(Ra).isPhysical()) {
+    LLVM_DEBUG(dbgs() << "Rd is a physical reg:" << Register(Rd).isPhysical()
+                      << '\n');
+    LLVM_DEBUG(dbgs() << "Ra is a physical reg:" << Register(Ra).isPhysical()
+                      << '\n');
     return false;
   }
 
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 839b7e81f8998..9a021925a6bd1 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -1108,7 +1108,7 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
   if (!SubIdx)
     return MIB.addReg(Reg, State);
 
-  if (Register::isPhysicalRegister(Reg))
+  if (Register(Reg).isPhysical())
     return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
   return MIB.addReg(Reg, State, SubIdx);
 }
diff --git a/llvm/lib/Target/ARM/ARMLatencyMutations.cpp b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
index 30e7ede68d787..601b3fa19978d 100644
--- a/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
+++ b/llvm/lib/Target/ARM/ARMLatencyMutations.cpp
@@ -802,7 +802,7 @@ signed M85Overrides::modifyMixedWidthFP(const MachineInstr *SrcMI,
             OP.getSubReg() == ARM::ssub_1)
           return 1;
     }
-  } else if (Register::isPhysicalRegister(RegID)) {
+  } else if (Register(RegID).isPhysical()) {
     // Note that when the producer is narrower, not all of the producers
     // may be present in the scheduling graph; somewhere earlier in the
     // compiler, an implicit def/use of the aliased full register gets
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 3b157006d9224..df182613d1661 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -223,8 +223,8 @@ static bool areCombinableOperations(const TargetRegisterInfo *TRI,
   return true;
 }
 
-static bool isEvenReg(unsigned Reg) {
-  assert(Register::isPhysicalRegister(Reg));
+static bool isEvenReg(Register Reg) {
+  assert(Reg.isPhysical());
   if (Hexagon::IntRegsRegClass.contains(Reg))
     return (Reg - Hexagon::R0) % 2 == 0;
   if (Hexagon::HvxVRRegClass.contains(Reg))
@@ -546,7 +546,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr &I1,
     // is even.
     bool IsI1LowReg = (I2DestReg - I1DestReg) == 1;
     bool IsI2LowReg = (I1DestReg - I2DestReg) == 1;
-    unsigned FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
+    Register FirstRegIndex = IsI1LowReg ? I1DestReg : I2DestReg;
     if ((!IsI1LowReg && !IsI2LowReg) || !isEvenReg(FirstRegIndex))
       continue;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
index ee01ebc4daa26..3bb7175bbf8b9 100644
--- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -275,7 +275,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
       return false;
   }
 
-  unsigned cmpReg1, cmpOp2 = 0; // cmpOp2 assignment silences compiler warning.
+  Register cmpReg1, cmpOp2;
   cmpReg1 = MI.getOperand(1).getReg();
 
   if (secondReg) {
@@ -290,7 +290,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
     // at machine code level, we don't need this, but if we decide
     // to move new value jump prior to RA, we would be needing this.
     MachineRegisterInfo &MRI = MF.getRegInfo();
-    if (!Register::isPhysicalRegister(cmpOp2)) {
+    if (!cmpOp2.isPhysical()) {
       MachineInstr *def = MRI.getVRegDef(cmpOp2);
       if (def->getOpcode() == TargetOpcode::COPY)
         return false;
@@ -480,7 +480,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
     bool foundJump    = false;
     bool foundCompare = false;
     bool invertPredicate = false;
-    unsigned predReg = 0; // predicate reg of the jump.
+    Register predReg; // predicate reg of the jump.
     unsigned cmpReg1 = 0;
     int cmpOp2 = 0;
     MachineBasicBlock::iterator jmpPos;
@@ -516,7 +516,7 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
         jmpPos = MII;
         jmpInstr = &MI;
         predReg = MI.getOperand(0).getReg();
-        afterRA = Register::isPhysicalRegister(predReg);
+        afterRA = predReg.isPhysical();
 
         // If ifconverter had not messed up with the kill flags of the
         // operands, the following check on the kill flag would suffice.
diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp
index fafdad08909dd..76177901f658a 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -44,8 +44,8 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
       const MachineOperand &Src = MI->getOperand(1);
       RegisterRef DstR = DFG.makeRegRef(Dst.getReg(), Dst.getSubReg());
       RegisterRef SrcR = DFG.makeRegRef(Src.getReg(), Src.getSubReg());
-      assert(Register::isPhysicalRegister(DstR.Reg));
-      assert(Register::isPhysicalRegister(SrcR.Reg));
+      assert(Register(DstR.Reg).isPhysical());
+      assert(Register(SrcR.Reg).isPhysical());
       const TargetRegisterInfo &TRI = DFG.getTRI();
       if (TRI.getMinimalPhysRegClass(DstR.Reg) !=
           TRI.getMinimalPhysRegClass(SrcR.Reg))
diff --git a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
index 62fb72ba4fd5e..5375d4484a7ab 100644
--- a/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
+++ b/llvm/lib/Target/M68k/M68kRegisterInfo.cpp
@@ -83,8 +83,7 @@ M68kRegisterInfo::getMatchingMegaReg(unsigned Reg,
 
 const TargetRegisterClass *
 M68kRegisterInfo::getMaximalPhysRegClass(unsigned reg, MVT VT) const {
-  assert(Register::isPhysicalRegister(reg) &&
-         "reg must be a physical register");
+  assert(Register(reg).isPhysical() && "reg must be a physical register");
 
   // Pick the most sub register class of the right type that contains
   // this physreg.
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
index 06ae8e1296e51..e40c85abc8b5d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMatInt.cpp
@@ -134,7 +134,7 @@ static void generateInstSeqImpl(int64_t Val, const MCSubtargetInfo &STI,
     }
 
     // Try to use SLLI_UW for Val when it is uint32 but not int32.
-    if (isUInt<32>((uint64_t)Val) && !isInt<32>((uint64_t)Val) &&
+    if (isUInt<32>(Val) && !isInt<32>(Val) &&
         STI.hasFeature(RISCV::FeatureStdExtZba)) {
       // Use LUI+ADDI or LUI to compose, then clear the upper 32 bits with
       // SLLI_UW.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp
index cd84e68aed140..0469fbf15b251 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySortRegion.cpp
@@ -28,17 +28,17 @@ const SortRegion *SortRegionInfo::getRegionFor(const MachineBasicBlock *MBB) {
   // WE->contains(ML->getHeader()), but not ML->contains(WE->getHeader()).
   if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) {
     // If the smallest region containing MBB is a loop
-    if (LoopMap.count(ML))
-      return LoopMap[ML].get();
-    LoopMap[ML] = std::make_unique<ConcreteSortRegion<MachineLoop>>(ML);
-    return LoopMap[ML].get();
+    auto [It, Inserted] = LoopMap.try_emplace(ML);
+    if (Inserted)
+      It->second = std::make_unique<ConcreteSortRegion<MachineLoop>>(ML);
+    return It->second.get();
   } else {
     // If the smallest region containing MBB is an exception
-    if (ExceptionMap.count(WE))
-      return ExceptionMap[WE].get();
-    ExceptionMap[WE] =
-        std::make_unique<ConcreteSortRegion<WebAssemblyException>>(WE);
-    return ExceptionMap[WE].get();
+    auto [It, Inserted] = ExceptionMap.try_emplace(WE);
+    if (Inserted)
+      It->second =
+          std::make_unique<ConcreteSortRegion<WebAssemblyException>>(WE);
+    return It->second.get();
   }
 }
 
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index f2707afe195c4..56b7b8bfe1f66 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -1393,9 +1393,12 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
     CallerORE.emit(OR);
 
     // Now update the entry count:
-    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
-      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
-      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+    if (CalleeEntryCountV) {
+      if (auto It = CallSiteToProfCountMap.find(User);
+          It != CallSiteToProfCountMap.end()) {
+        uint64_t CallSiteCount = It->second;
+        CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+      }
     }
 
     AnyInline = true;
diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
index 2d9a3d1f8a110..78b9c7d06e183 100644
--- a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
+++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -178,11 +178,9 @@ struct AssumeBuilderState {
     if (tryToPreserveWithoutAddingAssume(RK))
       return;
     MapKey Key{RK.WasOn, RK.AttrKind};
-    auto Lookup = AssumedKnowledgeMap.find(Key);
-    if (Lookup == AssumedKnowledgeMap.end()) {
-      AssumedKnowledgeMap[Key] = RK.ArgValue;
+    auto [Lookup, Inserted] = AssumedKnowledgeMap.try_emplace(Key, RK.ArgValue);
+    if (Inserted)
       return;
-    }
     assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
             (Lookup->second != 0 && RK.ArgValue != 0)) &&
            "inconsistent argument value");
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp
index 4df4829a04c41..37f1ec450f2eb 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/InstrMaps.cpp
@@ -8,10 +8,23 @@
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/InstrMaps.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h"
 
 namespace llvm::sandboxir {
 
 #ifndef NDEBUG
+void Action::print(raw_ostream &OS) const {
+  OS << Idx << ". " << *LegalityRes << " Depth:" << Depth << "\n";
+  OS.indent(2) << "Bndl:\n";
+  for (Value *V : Bndl)
+    OS.indent(4) << *V << "\n";
+  OS.indent(2) << "UserBndl:\n";
+  for (Value *V : UserBndl)
+    OS.indent(4) << *V << "\n";
+}
+
+void Action::dump() const { print(dbgs()); }
+
 void InstrMaps::dump() const {
   print(dbgs());
   dbgs() << "\n";
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index d57732090dcd6..14438181f2602 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -156,12 +156,7 @@ Value *BottomUpVec::createVectorInstr(ArrayRef<Value *> Bndl,
     // TODO: Propagate debug info.
   };
 
-  auto *VecI = CreateVectorInstr(Bndl, Operands);
-  if (VecI != nullptr) {
-    Change = true;
-    IMaps->registerVector(Bndl, VecI);
-  }
-  return VecI;
+  return CreateVectorInstr(Bndl, Operands);
 }
 
 void BottomUpVec::tryEraseDeadInstrs() {
@@ -266,135 +261,196 @@ void BottomUpVec::collectPotentiallyDeadInstrs(ArrayRef<Value *> Bndl) {
   }
 }
 
-Value *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl,
-                                 ArrayRef<Value *> UserBndl, unsigned Depth) {
-  Value *NewVec = nullptr;
-  auto *UserBB = !UserBndl.empty()
-                     ? cast<Instruction>(UserBndl.front())->getParent()
-                     : cast<Instruction>(Bndl[0])->getParent();
+Action *BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl,
+                                  ArrayRef<Value *> UserBndl, unsigned Depth) {
   const auto &LegalityRes = Legality->canVectorize(Bndl);
+  auto ActionPtr =
+      std::make_unique<Action>(&LegalityRes, Bndl, UserBndl, Depth);
+  SmallVector<Action *> Operands;
   switch (LegalityRes.getSubclassID()) {
   case LegalityResultID::Widen: {
     auto *I = cast<Instruction>(Bndl[0]);
-    SmallVector<Value *, 2> VecOperands;
     switch (I->getOpcode()) {
     case Instruction::Opcode::Load:
-      // Don't recurse towards the pointer operand.
-      VecOperands.push_back(cast<LoadInst>(I)->getPointerOperand());
       break;
     case Instruction::Opcode::Store: {
       // Don't recurse towards the pointer operand.
-      auto *VecOp = vectorizeRec(getOperand(Bndl, 0), Bndl, Depth + 1);
-      VecOperands.push_back(VecOp);
-      VecOperands.push_back(cast<StoreInst>(I)->getPointerOperand());
+      Action *OpA = vectorizeRec(getOperand(Bndl, 0), Bndl, Depth + 1);
+      Operands.push_back(OpA);
       break;
     }
     default:
       // Visit all operands.
       for (auto OpIdx : seq<unsigned>(I->getNumOperands())) {
-        auto *VecOp = vectorizeRec(getOperand(Bndl, OpIdx), Bndl, Depth + 1);
-        VecOperands.push_back(VecOp);
+        Action *OpA = vectorizeRec(getOperand(Bndl, OpIdx), Bndl, Depth + 1);
+        Operands.push_back(OpA);
       }
       break;
     }
-    NewVec = createVectorInstr(Bndl, VecOperands);
-
-    // Collect any potentially dead scalar instructions, including the original
-    // scalars and pointer operands of loads/stores.
-    if (NewVec != nullptr)
-      collectPotentiallyDeadInstrs(Bndl);
+    // Update the maps to mark Bndl as "vectorized".
+    IMaps->registerVector(Bndl, ActionPtr.get());
     break;
   }
-  case LegalityResultID::DiamondReuse: {
-    NewVec = cast<DiamondReuse>(LegalityRes).getVector();
+  case LegalityResultID::DiamondReuse:
+  case LegalityResultID::DiamondReuseWithShuffle:
+  case LegalityResultID::DiamondReuseMultiInput:
+  case LegalityResultID::Pack:
     break;
   }
-  case LegalityResultID::DiamondReuseWithShuffle: {
-    auto *VecOp = cast<DiamondReuseWithShuffle>(LegalityRes).getVector();
-    const ShuffleMask &Mask =
-        cast<DiamondReuseWithShuffle>(LegalityRes).getMask();
-    NewVec = createShuffle(VecOp, Mask, UserBB);
-    assert(NewVec->getType() == VecOp->getType() &&
-           "Expected same type! Bad mask ?");
-    break;
+  // Create actions in post-order.
+  ActionPtr->Operands = std::move(Operands);
+  auto *Action = ActionPtr.get();
+  Actions.push_back(std::move(ActionPtr));
+  return Action;
+}
+
+#ifndef NDEBUG
+void BottomUpVec::ActionsVector::print(raw_ostream &OS) const {
+  for (auto [Idx, Action] : enumerate(Actions)) {
+    Action->print(OS);
+    OS << "\n";
   }
-  case LegalityResultID::DiamondReuseMultiInput: {
-    const auto &Descr =
-        cast<DiamondReuseMultiInput>(LegalityRes).getCollectDescr();
-    Type *ResTy = VecUtils::getWideType(Bndl[0]->getType(), Bndl.size());
+}
+void BottomUpVec::ActionsVector::dump() const { print(dbgs()); }
+#endif // NDEBUG
+
+Value *BottomUpVec::emitVectors() {
+  Value *NewVec = nullptr;
+  for (const auto &ActionPtr : Actions) {
+    ArrayRef<Value *> Bndl = ActionPtr->Bndl;
+    ArrayRef<Value *> UserBndl = ActionPtr->UserBndl;
+    const LegalityResult &LegalityRes = *ActionPtr->LegalityRes;
+    unsigned Depth = ActionPtr->Depth;
+    auto *UserBB = !UserBndl.empty()
+                       ? cast<Instruction>(UserBndl.front())->getParent()
+                       : cast<Instruction>(Bndl[0])->getParent();
 
-    // TODO: Try to get WhereIt without creating a vector.
-    SmallVector<Value *, 4> DescrInstrs;
-    for (const auto &ElmDescr : Descr.getDescrs()) {
-      if (auto *I = dyn_cast<Instruction>(ElmDescr.getValue()))
-        DescrInstrs.push_back(I);
+    switch (LegalityRes.getSubclassID()) {
+    case LegalityResultID::Widen: {
+      auto *I = cast<Instruction>(Bndl[0]);
+      SmallVector<Value *, 2> VecOperands;
+      switch (I->getOpcode()) {
+      case Instruction::Opcode::Load:
+        VecOperands.push_back(cast<LoadInst>(I)->getPointerOperand());
+        break;
+      case Instruction::Opcode::Store: {
+        VecOperands.push_back(ActionPtr->Operands[0]->Vec);
+        VecOperands.push_back(cast<StoreInst>(I)->getPointerOperand());
+        break;
+      }
+      default:
+        // Visit all operands.
+        for (Action *OpA : ActionPtr->Operands) {
+          auto *VecOp = OpA->Vec;
+          VecOperands.push_back(VecOp);
+        }
+        break;
+      }
+      NewVec = createVectorInstr(ActionPtr->Bndl, VecOperands);
+      // Collect any potentially dead scalar instructions, including the
+      // original scalars and pointer operands of loads/stores.
+      if (NewVec != nullptr)
+        collectPotentiallyDeadInstrs(Bndl);
+      break;
+    }
+    case LegalityResultID::DiamondReuse: {
+      NewVec = cast<DiamondReuse>(LegalityRes).getVector()->Vec;
+      break;
+    }
+    case LegalityResultID::DiamondReuseWithShuffle: {
+      auto *VecOp = cast<DiamondReuseWithShuffle>(LegalityRes).getVector()->Vec;
+      const ShuffleMask &Mask =
+          cast<DiamondReuseWithShuffle>(LegalityRes).getMask();
+      NewVec = createShuffle(VecOp, Mask, UserBB);
+      assert(NewVec->getType() == VecOp->getType() &&
+             "Expected same type! Bad mask ?");
+      break;
     }
-    BasicBlock::iterator WhereIt =
-        getInsertPointAfterInstrs(DescrInstrs, UserBB);
+    case LegalityResultID::DiamondReuseMultiInput: {
+      const auto &Descr =
+          cast<DiamondReuseMultiInput>(LegalityRes).getCollectDescr();
+      Type *ResTy = VecUtils::getWideType(Bndl[0]->getType(), Bndl.size());
 
-    Value *LastV = PoisonValue::get(ResTy);
-    unsigned Lane = 0;
-    for (const auto &ElmDescr : Descr.getDescrs()) {
-      Value *VecOp = ElmDescr.getValue();
-      Context &Ctx = VecOp->getContext();
-      Value *ValueToInsert;
-      if (ElmDescr.needsExtract()) {
-        ConstantInt *IdxC =
-            ConstantInt::get(Type::getInt32Ty(Ctx), ElmDescr.getExtractIdx());
-        ValueToInsert = ExtractElementInst::create(VecOp, IdxC, WhereIt,
-                                                   VecOp->getContext(), "VExt");
-      } else {
-        ValueToInsert = VecOp;
+      // TODO: Try to get WhereIt without creating a vector.
+      SmallVector<Value *, 4> DescrInstrs;
+      for (const auto &ElmDescr : Descr.getDescrs()) {
+        auto *V = ElmDescr.needsExtract() ? ElmDescr.getValue()->Vec
+                                          : ElmDescr.getScalar();
+        if (auto *I = dyn_cast<Instruction>(V))
+          DescrInstrs.push_back(I);
       }
-      auto NumLanesToInsert = VecUtils::getNumLanes(ValueToInsert);
-      if (NumLanesToInsert == 1) {
-        // If we are inserting a scalar element then we need a single insert.
-        //   %VIns = insert %DstVec,  %SrcScalar, Lane
-        ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane);
-        LastV = InsertElementInst::create(LastV, ValueToInsert, LaneC, WhereIt,
-                                          Ctx, "VIns");
-      } else {
-        // If we are inserting a vector element then we need to extract and
-        // insert each vector element one by one with a chain of extracts and
-        // inserts, for example:
-        //   %VExt0 = extract %SrcVec, 0
-        //   %VIns0 = insert  %DstVec, %Vect0, Lane + 0
-        //   %VExt1 = extract %SrcVec, 1
-        //   %VIns1 = insert  %VIns0,  %Vect0, Lane + 1
-        for (unsigned LnCnt = 0; LnCnt != NumLanesToInsert; ++LnCnt) {
-          auto *ExtrIdxC = ConstantInt::get(Type::getInt32Ty(Ctx), LnCnt);
-          auto *ExtrI = ExtractElementInst::create(ValueToInsert, ExtrIdxC,
-                                                   WhereIt, Ctx, "VExt");
-          unsigned InsLane = Lane + LnCnt;
-          auto *InsLaneC = ConstantInt::get(Type::getInt32Ty(Ctx), InsLane);
-          LastV = InsertElementInst::create(LastV, ExtrI, InsLaneC, WhereIt,
-                                            Ctx, "VIns");
+      BasicBlock::iterator WhereIt =
+          getInsertPointAfterInstrs(DescrInstrs, UserBB);
+
+      Value *LastV = PoisonValue::get(ResTy);
+      Context &Ctx = LastV->getContext();
+      unsigned Lane = 0;
+      for (const auto &ElmDescr : Descr.getDescrs()) {
+        Value *VecOp = nullptr;
+        Value *ValueToInsert;
+        if (ElmDescr.needsExtract()) {
+          VecOp = ElmDescr.getValue()->Vec;
+          ConstantInt *IdxC =
+              ConstantInt::get(Type::getInt32Ty(Ctx), ElmDescr.getExtractIdx());
+          ValueToInsert = ExtractElementInst::create(
+              VecOp, IdxC, WhereIt, VecOp->getContext(), "VExt");
+        } else {
+          ValueToInsert = ElmDescr.getScalar();
+        }
+        auto NumLanesToInsert = VecUtils::getNumLanes(ValueToInsert);
+        if (NumLanesToInsert == 1) {
+          // If we are inserting a scalar element then we need a single insert.
+          //   %VIns = insert %DstVec,  %SrcScalar, Lane
+          ConstantInt *LaneC = ConstantInt::get(Type::getInt32Ty(Ctx), Lane);
+          LastV = InsertElementInst::create(LastV, ValueToInsert, LaneC,
+                                            WhereIt, Ctx, "VIns");
+        } else {
+          // If we are inserting a vector element then we need to extract and
+          // insert each vector element one by one with a chain of extracts and
+          // inserts, for example:
+          //   %VExt0 = extract %SrcVec, 0
+          //   %VIns0 = insert  %DstVec, %Vect0, Lane + 0
+          //   %VExt1 = extract %SrcVec, 1
+          //   %VIns1 = insert  %VIns0,  %Vect0, Lane + 1
+          for (unsigned LnCnt = 0; LnCnt != NumLanesToInsert; ++LnCnt) {
+            auto *ExtrIdxC = ConstantInt::get(Type::getInt32Ty(Ctx), LnCnt);
+            auto *ExtrI = ExtractElementInst::create(ValueToInsert, ExtrIdxC,
+                                                     WhereIt, Ctx, "VExt");
+            unsigned InsLane = Lane + LnCnt;
+            auto *InsLaneC = ConstantInt::get(Type::getInt32Ty(Ctx), InsLane);
+            LastV = InsertElementInst::create(LastV, ExtrI, InsLaneC, WhereIt,
+                                              Ctx, "VIns");
+          }
         }
+        Lane += NumLanesToInsert;
       }
-      Lane += NumLanesToInsert;
+      NewVec = LastV;
+      break;
+    }
+    case LegalityResultID::Pack: {
+      // If we can't vectorize the seeds then just return.
+      if (Depth == 0)
+        return nullptr;
+      NewVec = createPack(Bndl, UserBB);
+      break;
+    }
+    }
+    if (NewVec != nullptr) {
+      Change = true;
+      ActionPtr->Vec = NewVec;
     }
-    NewVec = LastV;
-    break;
-  }
-  case LegalityResultID::Pack: {
-    // If we can't vectorize the seeds then just return.
-    if (Depth == 0)
-      return nullptr;
-    NewVec = createPack(Bndl, UserBB);
-    break;
-  }
-  }
 #ifndef NDEBUG
-  if (AlwaysVerify) {
-    // This helps find broken IR by constantly verifying the function. Note that
-    // this is very expensive and should only be used for debugging.
-    Instruction *I0 = isa<Instruction>(Bndl[0])
-                          ? cast<Instruction>(Bndl[0])
-                          : cast<Instruction>(UserBndl[0]);
-    assert(!Utils::verifyFunction(I0->getParent()->getParent(), dbgs()) &&
-           "Broken function!");
+    if (AlwaysVerify) {
+      // This helps find broken IR by constantly verifying the function. Note
+      // that this is very expensive and should only be used for debugging.
+      Instruction *I0 = isa<Instruction>(Bndl[0])
+                            ? cast<Instruction>(Bndl[0])
+                            : cast<Instruction>(UserBndl[0]);
+      assert(!Utils::verifyFunction(I0->getParent()->getParent(), dbgs()) &&
+             "Broken function!");
+    }
+#endif // NDEBUG
   }
-#endif
   return NewVec;
 }
 
@@ -402,7 +458,9 @@ bool BottomUpVec::tryVectorize(ArrayRef<Value *> Bndl) {
   Change = false;
   DeadInstrCandidates.clear();
   Legality->clear();
+  Actions.clear();
   vectorizeRec(Bndl, {}, /*Depth=*/0);
+  emitVectors();
   tryEraseDeadInstrs();
   return Change;
 }
@@ -411,7 +469,7 @@ bool BottomUpVec::runOnRegion(Region &Rgn, const Analyses &A) {
   const auto &SeedSlice = Rgn.getAux();
   assert(SeedSlice.size() >= 2 && "Bad slice!");
   Function &F = *SeedSlice[0]->getParent()->getParent();
-  IMaps = std::make_unique<InstrMaps>(F.getContext());
+  IMaps = std::make_unique<InstrMaps>();
   Legality = std::make_unique<LegalityAnalysis>(
       A.getAA(), A.getScalarEvolution(), F.getParent()->getDataLayout(),
       F.getContext(), *IMaps);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 746742e14d080..cdb8853f7503c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -3036,8 +3036,6 @@ bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
   Worklist.pushValue(NSV0B);
   Worklist.pushValue(NSV1A);
   Worklist.pushValue(NSV1B);
-  for (auto *S : Shuffles)
-    Worklist.add(S);
   return true;
 }
 
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll b/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll
index 5234d8f107271..d4f7f82a8cff1 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/nusw-predicates.ll
@@ -15,6 +15,12 @@ define void @int_and_pointer_predicate(ptr %v, i32 %N) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
+; CHECK-NEXT:        Group [[GRP1:0x[0-9a-f]+]]:
+; CHECK-NEXT:          (Low: %v High: (2 + %v))
+; CHECK-NEXT:            Member: %v
+; CHECK-NEXT:        Group [[GRP2:0x[0-9a-f]+]]:
+; CHECK-NEXT:          (Low: %v High: (6 + (4 * (trunc i32 %N to i16)) + %v))
+; CHECK-NEXT:            Member: {%v,+,4}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -57,36 +63,36 @@ define void @int_and_multiple_pointer_predicates(ptr %v, ptr %w, i32 %N) {
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Check 0:
-; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Comparing group ([[GRP3:0x[0-9a-f]+]]):
 ; CHECK-NEXT:        ptr %v
-; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Against group ([[GRP4:0x[0-9a-f]+]]):
 ; CHECK-NEXT:        ptr %w
 ; CHECK-NEXT:      Check 1:
-; CHECK-NEXT:        Comparing group ([[GRP1]]):
+; CHECK-NEXT:        Comparing group ([[GRP3]]):
 ; CHECK-NEXT:        ptr %v
-; CHECK-NEXT:        Against group ([[GRP3:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Against group ([[GRP5:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.w = getelementptr { i16, i16 }, ptr %w, i16 %iv.i16
 ; CHECK-NEXT:      Check 2:
-; CHECK-NEXT:        Comparing group ([[GRP4:0x[0-9a-f]+]]):
+; CHECK-NEXT:        Comparing group ([[GRP6:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.v = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16
-; CHECK-NEXT:        Against group ([[GRP2]]):
+; CHECK-NEXT:        Against group ([[GRP4]]):
 ; CHECK-NEXT:        ptr %w
 ; CHECK-NEXT:      Check 3:
-; CHECK-NEXT:        Comparing group ([[GRP4]]):
+; CHECK-NEXT:        Comparing group ([[GRP6]]):
 ; CHECK-NEXT:          %gep.v = getelementptr { i16, i16 }, ptr %v, i16 %iv.i16
-; CHECK-NEXT:        Against group ([[GRP3]]):
+; CHECK-NEXT:        Against group ([[GRP5]]):
 ; CHECK-NEXT:          %gep.w = getelementptr { i16, i16 }, ptr %w, i16 %iv.i16
 ; CHECK-NEXT:      Grouped accesses:
-; CHECK-NEXT:        Group [[GRP1]]:
+; CHECK-NEXT:        Group [[GRP3]]:
 ; CHECK-NEXT:          (Low: %v High: (2 + %v))
 ; CHECK-NEXT:            Member: %v
-; CHECK-NEXT:        Group [[GRP4]]:
+; CHECK-NEXT:        Group [[GRP6]]:
 ; CHECK-NEXT:          (Low: %v High: (6 + (4 * (trunc i32 %N to i16)) + %v))
 ; CHECK-NEXT:            Member: {%v,+,4}<%loop>
-; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:        Group [[GRP4]]:
 ; CHECK-NEXT:          (Low: %w High: (2 + %w))
 ; CHECK-NEXT:            Member: %w
-; CHECK-NEXT:        Group [[GRP3]]:
+; CHECK-NEXT:        Group [[GRP5]]:
 ; CHECK-NEXT:          (Low: %w High: (6 + (4 * (trunc i32 %N to i16)) + %w))
 ; CHECK-NEXT:            Member: {%w,+,4}<%loop>
 ; CHECK-EMPTY:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
index 38b7389ae9083..021447d53f943 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis-forked-pointers.ll
@@ -163,7 +163,7 @@ exit:
 define void @dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs_may_wrap_1(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) {
 ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs_may_wrap_1'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot check memory dependencies at runtime
+; CHECK-NEXT:      Report: cannot identify array bounds
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
@@ -204,7 +204,7 @@ exit:
 define void @dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs_may_wrap_2(ptr %a, ptr %b, ptr %c, i64 %offset, i64 %n) {
 ; CHECK-LABEL: 'dependency_check_and_runtime_checks_needed_select_of_ptr_add_recs_may_wrap_2'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Report: cannot check memory dependencies at runtime
+; CHECK-NEXT:      Report: cannot identify array bounds
 ; CHECK-NEXT:      Dependences:
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Grouped accesses:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll
index 26c571b9cb63a..a15253a901488 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/retry-runtime-checks-after-dependence-analysis.ll
@@ -72,27 +72,27 @@ define void @dependency_check_and_runtime_checks_needed_gepb_not_inbounds_iv2_st
 ; CHECK-NEXT:        Comparing group ([[GRP4:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
 ; CHECK-NEXT:        Against group ([[GRP5:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
+; CHECK-NEXT:          %gep.b = getelementptr i8, ptr %b, i64 %iv2
 ; CHECK-NEXT:      Check 1:
 ; CHECK-NEXT:        Comparing group ([[GRP4]]):
 ; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
 ; CHECK-NEXT:        Against group ([[GRP6:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.b = getelementptr i8, ptr %b, i64 %iv2
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
 ; CHECK-NEXT:      Check 2:
 ; CHECK-NEXT:        Comparing group ([[GRP5]]):
-; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
-; CHECK-NEXT:        Against group ([[GRP6]]):
 ; CHECK-NEXT:          %gep.b = getelementptr i8, ptr %b, i64 %iv2
+; CHECK-NEXT:        Against group ([[GRP6]]):
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-NEXT:        Group [[GRP4]]:
 ; CHECK-NEXT:          (Low: %a High: ((4 * %n) + %a))
 ; CHECK-NEXT:            Member: {%a,+,4}<nuw><%loop>
 ; CHECK-NEXT:        Group [[GRP5]]:
-; CHECK-NEXT:          (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a))
-; CHECK-NEXT:            Member: {((4 * %offset) + %a),+,4}<%loop>
-; CHECK-NEXT:        Group [[GRP6]]:
 ; CHECK-NEXT:          (Low: %b High: (-1 + (5 * %n) + %b))
 ; CHECK-NEXT:            Member: {%b,+,5}<%loop>
+; CHECK-NEXT:        Group [[GRP6]]:
+; CHECK-NEXT:          (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a))
+; CHECK-NEXT:            Member: {((4 * %offset) + %a),+,4}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
@@ -265,27 +265,27 @@ define void @dependency_check_and_runtime_checks_needed_gepb_may_wrap(ptr %a, pt
 ; CHECK-NEXT:        Comparing group ([[GRP13:0x[0-9a-f]+]]):
 ; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
 ; CHECK-NEXT:        Against group ([[GRP14:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
+; CHECK-NEXT:          %gep.b = getelementptr float, ptr %b, i64 %iv2
 ; CHECK-NEXT:      Check 1:
 ; CHECK-NEXT:        Comparing group ([[GRP13]]):
 ; CHECK-NEXT:          %gep.a.iv = getelementptr inbounds float, ptr %a, i64 %iv
 ; CHECK-NEXT:        Against group ([[GRP15:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.b = getelementptr float, ptr %b, i64 %iv2
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
 ; CHECK-NEXT:      Check 2:
 ; CHECK-NEXT:        Comparing group ([[GRP14]]):
-; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
-; CHECK-NEXT:        Against group ([[GRP15]]):
 ; CHECK-NEXT:          %gep.b = getelementptr float, ptr %b, i64 %iv2
+; CHECK-NEXT:        Against group ([[GRP15]]):
+; CHECK-NEXT:          %gep.a.iv.off = getelementptr inbounds float, ptr %a, i64 %iv.offset
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-NEXT:        Group [[GRP13]]:
 ; CHECK-NEXT:          (Low: %a High: ((4 * %n) + %a))
 ; CHECK-NEXT:            Member: {%a,+,4}<nuw><%loop>
 ; CHECK-NEXT:        Group [[GRP14]]:
-; CHECK-NEXT:          (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a))
-; CHECK-NEXT:            Member: {((4 * %offset) + %a),+,4}<%loop>
-; CHECK-NEXT:        Group [[GRP15]]:
 ; CHECK-NEXT:          (Low: %b High: (-4 + (8 * %n) + %b))
 ; CHECK-NEXT:            Member: {%b,+,8}<%loop>
+; CHECK-NEXT:        Group [[GRP15]]:
+; CHECK-NEXT:          (Low: ((4 * %offset) + %a) High: ((4 * %offset) + (4 * %n) + %a))
+; CHECK-NEXT:            Member: {((4 * %offset) + %a),+,4}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
diff --git a/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll
index b27937862b261..cce6f829d05af 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/runtime-checks-may-wrap.ll
@@ -11,20 +11,21 @@ define void @geps_may_wrap(ptr %a, ptr %b, i64 %N) {
 ; CHECK-NEXT:      Run-time memory checks:
 ; CHECK-NEXT:      Check 0:
 ; CHECK-NEXT:        Comparing group ([[GRP1:0x[0-9a-f]+]]):
-; CHECK-NEXT:          %gep.iv = getelementptr i32, ptr %a, i64 %iv
-; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
 ; CHECK-NEXT:        ptr %b
+; CHECK-NEXT:        Against group ([[GRP2:0x[0-9a-f]+]]):
+; CHECK-NEXT:          %gep.iv = getelementptr i32, ptr %a, i64 %iv
 ; CHECK-NEXT:      Grouped accesses:
 ; CHECK-NEXT:        Group [[GRP1]]:
-; CHECK-NEXT:          (Low: %a High: (16 + (12 * (trunc i128 ((zext i64 %N to i128) /u 3) to i16)) + %a))
-; CHECK-NEXT:            Member: {%a,+,12}<%loop>
-; CHECK-NEXT:        Group [[GRP2]]:
 ; CHECK-NEXT:          (Low: %b High: (4 + %b))
 ; CHECK-NEXT:            Member: %b
+; CHECK-NEXT:        Group [[GRP2]]:
+; CHECK-NEXT:          (Low: %a High: (16 + (12 * (trunc i128 ((zext i64 %N to i128) /u 3) to i16)) + %a))
+; CHECK-NEXT:            Member: {%a,+,12}<%loop>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Non vectorizable stores to invariant address were not found in loop.
 ; CHECK-NEXT:      SCEV assumptions:
 ; CHECK-NEXT:      {0,+,3}<%loop> Added Flags: <nusw>
+; CHECK-NEXT:      {%a,+,12}<%loop> Added Flags: <nusw>
 ; CHECK-EMPTY:
 ; CHECK-NEXT:      Expressions re-written:
 ;
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
index 754b86ab2fb87..cf4fc143fe8c3 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll
@@ -1123,15 +1123,14 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; DEFAULT-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
 ; DEFAULT-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; DEFAULT:       vector.scevcheck:
-; DEFAULT-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
 ; DEFAULT-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; DEFAULT-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
 ; DEFAULT-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
 ; DEFAULT-NEXT:    [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
-; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
-; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[DST]]
 ; DEFAULT-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; DEFAULT-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 8
+; DEFAULT-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 4
 ; DEFAULT-NEXT:    [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; DEFAULT-NEXT:    [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
 ; DEFAULT-NEXT:    [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
@@ -1139,12 +1138,13 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; DEFAULT-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
 ; DEFAULT-NEXT:    [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
 ; DEFAULT-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; DEFAULT-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 8
 ; DEFAULT-NEXT:    [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; DEFAULT-NEXT:    [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
 ; DEFAULT-NEXT:    [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1
 ; DEFAULT-NEXT:    [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT6]]
-; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT6]]
-; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[DST]]
+; DEFAULT-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[MUL_RESULT6]]
+; DEFAULT-NEXT:    [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[SCEVGEP4]]
 ; DEFAULT-NEXT:    [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
 ; DEFAULT-NEXT:    [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
 ; DEFAULT-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
@@ -1337,15 +1337,14 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT:    [[TMP0:%.*]] = add i64 [[N]], 1
 ; PRED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
 ; PRED:       vector.scevcheck:
-; PRED-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 4
 ; PRED-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; PRED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
 ; PRED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
 ; PRED-NEXT:    [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
-; PRED-NEXT:    [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
+; PRED-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT]]
+; PRED-NEXT:    [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[DST]]
 ; PRED-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
-; PRED-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 8
+; PRED-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 4
 ; PRED-NEXT:    [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; PRED-NEXT:    [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
 ; PRED-NEXT:    [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
@@ -1353,12 +1352,13 @@ define void @test_conditional_interleave_group (ptr noalias %src.1, ptr noalias
 ; PRED-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
 ; PRED-NEXT:    [[TMP7:%.*]] = icmp ult ptr [[TMP6]], [[SCEVGEP1]]
 ; PRED-NEXT:    [[TMP8:%.*]] = or i1 [[TMP7]], [[MUL_OVERFLOW4]]
+; PRED-NEXT:    [[SCEVGEP4:%.*]] = getelementptr i8, ptr [[DST]], i64 8
 ; PRED-NEXT:    [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[N]])
 ; PRED-NEXT:    [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0
 ; PRED-NEXT:    [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1
 ; PRED-NEXT:    [[TMP9:%.*]] = sub i64 0, [[MUL_RESULT6]]
-; PRED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT6]]
-; PRED-NEXT:    [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[DST]]
+; PRED-NEXT:    [[TMP10:%.*]] = getelementptr i8, ptr [[SCEVGEP4]], i64 [[MUL_RESULT6]]
+; PRED-NEXT:    [[TMP11:%.*]] = icmp ult ptr [[TMP10]], [[SCEVGEP4]]
 ; PRED-NEXT:    [[TMP12:%.*]] = or i1 [[TMP11]], [[MUL_OVERFLOW7]]
 ; PRED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP4]], [[TMP8]]
 ; PRED-NEXT:    [[TMP14:%.*]] = or i1 [[TMP13]], [[TMP12]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
index 4ea248254f2c6..f7b8758084056 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -500,18 +500,47 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:  entry:
 ; STRIDED-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; STRIDED-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
-; STRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 24, i64 [[TMP1]])
+; STRIDED-NEXT:    [[TMP2:%.*]] = call i64 @llvm.umax.i64(i64 80, i64 [[TMP1]])
 ; STRIDED-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP2]]
 ; STRIDED-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; STRIDED:       vector.scevcheck:
+; STRIDED-NEXT:    [[TMP24:%.*]] = shl i64 [[STRIDE:%.*]], 2
+; STRIDED-NEXT:    [[TMP25:%.*]] = mul i64 [[STRIDE]], -4
+; STRIDED-NEXT:    [[TMP26:%.*]] = icmp slt i64 [[TMP24]], 0
+; STRIDED-NEXT:    [[TMP27:%.*]] = select i1 [[TMP26]], i64 [[TMP25]], i64 [[TMP24]]
+; STRIDED-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[TMP27]], i64 1023)
+; STRIDED-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; STRIDED-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; STRIDED-NEXT:    [[TMP28:%.*]] = sub i64 0, [[MUL_RESULT]]
+; STRIDED-NEXT:    [[TMP29:%.*]] = getelementptr i8, ptr [[P2:%.*]], i64 [[MUL_RESULT]]
+; STRIDED-NEXT:    [[TMP30:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP28]]
+; STRIDED-NEXT:    [[TMP31:%.*]] = icmp ult ptr [[TMP29]], [[P2]]
+; STRIDED-NEXT:    [[TMP32:%.*]] = icmp ugt ptr [[TMP30]], [[P2]]
+; STRIDED-NEXT:    [[TMP33:%.*]] = select i1 [[TMP26]], i1 [[TMP32]], i1 [[TMP31]]
+; STRIDED-NEXT:    [[TMP13:%.*]] = or i1 [[TMP33]], [[MUL_OVERFLOW]]
+; STRIDED-NEXT:    [[TMP34:%.*]] = icmp slt i64 [[TMP24]], 0
+; STRIDED-NEXT:    [[TMP15:%.*]] = select i1 [[TMP34]], i64 [[TMP25]], i64 [[TMP24]]
+; STRIDED-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 [[TMP15]], i64 1023)
+; STRIDED-NEXT:    [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; STRIDED-NEXT:    [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; STRIDED-NEXT:    [[TMP16:%.*]] = sub i64 0, [[MUL_RESULT2]]
+; STRIDED-NEXT:    [[TMP35:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[MUL_RESULT2]]
+; STRIDED-NEXT:    [[TMP36:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP16]]
+; STRIDED-NEXT:    [[TMP37:%.*]] = icmp ult ptr [[TMP35]], [[P]]
+; STRIDED-NEXT:    [[TMP38:%.*]] = icmp ugt ptr [[TMP36]], [[P]]
+; STRIDED-NEXT:    [[TMP39:%.*]] = select i1 [[TMP34]], i1 [[TMP38]], i1 [[TMP37]]
+; STRIDED-NEXT:    [[TMP40:%.*]] = or i1 [[TMP39]], [[MUL_OVERFLOW3]]
+; STRIDED-NEXT:    [[TMP23:%.*]] = or i1 [[TMP13]], [[TMP40]]
+; STRIDED-NEXT:    br i1 [[TMP23]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK1:%.*]]
 ; STRIDED:       vector.memcheck:
-; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[STRIDE:%.*]], 4092
-; STRIDED-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P2:%.*]], i64 [[TMP3]]
+; STRIDED-NEXT:    [[TMP3:%.*]] = mul i64 [[STRIDE]], 4092
+; STRIDED-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[P2]], i64 [[TMP3]]
 ; STRIDED-NEXT:    [[TMP4:%.*]] = icmp ult ptr [[P2]], [[SCEVGEP]]
 ; STRIDED-NEXT:    [[UMIN:%.*]] = select i1 [[TMP4]], ptr [[P2]], ptr [[SCEVGEP]]
 ; STRIDED-NEXT:    [[TMP5:%.*]] = icmp ugt ptr [[P2]], [[SCEVGEP]]
 ; STRIDED-NEXT:    [[UMAX:%.*]] = select i1 [[TMP5]], ptr [[P2]], ptr [[SCEVGEP]]
 ; STRIDED-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[UMAX]], i64 4
-; STRIDED-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP3]]
+; STRIDED-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP3]]
 ; STRIDED-NEXT:    [[TMP6:%.*]] = icmp ult ptr [[P]], [[SCEVGEP2]]
 ; STRIDED-NEXT:    [[UMIN3:%.*]] = select i1 [[TMP6]], ptr [[P]], ptr [[SCEVGEP2]]
 ; STRIDED-NEXT:    [[TMP7:%.*]] = icmp ugt ptr [[P]], [[SCEVGEP2]]
@@ -554,7 +583,7 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
 ; STRIDED-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; STRIDED-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; STRIDED:       scalar.ph:
-; STRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ]
+; STRIDED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_MEMCHECK1]] ]
 ; STRIDED-NEXT:    br label [[LOOP:%.*]]
 ; STRIDED:       loop:
 ; STRIDED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index cf66264486095..b885d85a96800 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -114,8 +114,18 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
 ; CHECK-SAME: ptr [[ARG:%.*]], i64 [[ARG1:%.*]], ptr [[ARG2:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = add i64 [[ARG1]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 18
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[ARG]], i64 16
+; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[ARG1]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 0, [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ult ptr [[TMP2]], [[SCEVGEP]]
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    br i1 [[TMP4]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP20:%.*]] = shl i64 [[ARG1]], 4
 ; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[TMP20]], 16
@@ -167,7 +177,7 @@ define void @geps_feeding_interleave_groups_with_reuse(ptr %arg, i64 %arg1, ptr
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ], [ 0, %[[VECTOR_SCEVCHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ]
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
 ; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
@@ -266,19 +276,26 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = lshr i64 [[N]], 3
 ; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1
-; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 52
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 64
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_SCEVCHECK:.*]]
 ; CHECK:       [[VECTOR_SCEVCHECK]]:
 ; CHECK-NEXT:    [[TMP2:%.*]] = lshr i64 [[N]], 3
-; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 24
 ; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
 ; CHECK-NEXT:    [[TMP3:%.*]] = sub i64 0, [[MUL_RESULT]]
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP41:%.*]] = icmp ult ptr [[TMP32]], [[A]]
+; CHECK-NEXT:    [[TMP44:%.*]] = or i1 [[TMP41]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; CHECK-NEXT:    [[TMP55:%.*]] = sub i64 0, [[MUL_RESULT2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr i8, ptr [[SCEVGEP]], i64 [[MUL_RESULT2]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = icmp ult ptr [[TMP4]], [[SCEVGEP]]
-; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW]]
-; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 28
+; CHECK-NEXT:    [[TMP57:%.*]] = or i1 [[TMP5]], [[MUL_OVERFLOW3]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 8
 ; CHECK-NEXT:    [[MUL2:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT3:%.*]] = extractvalue { i64, i1 } [[MUL2]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW4:%.*]] = extractvalue { i64, i1 } [[MUL2]], 1
@@ -286,7 +303,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[SCEVGEP1]], i64 [[MUL_RESULT3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp ult ptr [[TMP8]], [[SCEVGEP1]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP9]], [[MUL_OVERFLOW4]]
-; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 20
+; CHECK-NEXT:    [[SCEVGEP5:%.*]] = getelementptr i8, ptr [[A]], i64 12
 ; CHECK-NEXT:    [[MUL6:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT7:%.*]] = extractvalue { i64, i1 } [[MUL6]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW8:%.*]] = extractvalue { i64, i1 } [[MUL6]], 1
@@ -302,7 +319,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr i8, ptr [[SCEVGEP9]], i64 [[MUL_RESULT11]]
 ; CHECK-NEXT:    [[TMP17:%.*]] = icmp ult ptr [[TMP16]], [[SCEVGEP9]]
 ; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP17]], [[MUL_OVERFLOW12]]
-; CHECK-NEXT:    [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 12
+; CHECK-NEXT:    [[SCEVGEP13:%.*]] = getelementptr i8, ptr [[A]], i64 20
 ; CHECK-NEXT:    [[MUL14:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT15:%.*]] = extractvalue { i64, i1 } [[MUL14]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW16:%.*]] = extractvalue { i64, i1 } [[MUL14]], 1
@@ -310,7 +327,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i8, ptr [[SCEVGEP13]], i64 [[MUL_RESULT15]]
 ; CHECK-NEXT:    [[TMP21:%.*]] = icmp ult ptr [[TMP20]], [[SCEVGEP13]]
 ; CHECK-NEXT:    [[TMP22:%.*]] = or i1 [[TMP21]], [[MUL_OVERFLOW16]]
-; CHECK-NEXT:    [[SCEVGEP17:%.*]] = getelementptr i8, ptr [[A]], i64 8
+; CHECK-NEXT:    [[SCEVGEP17:%.*]] = getelementptr i8, ptr [[A]], i64 24
 ; CHECK-NEXT:    [[MUL18:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT19:%.*]] = extractvalue { i64, i1 } [[MUL18]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW20:%.*]] = extractvalue { i64, i1 } [[MUL18]], 1
@@ -318,7 +335,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr i8, ptr [[SCEVGEP17]], i64 [[MUL_RESULT19]]
 ; CHECK-NEXT:    [[TMP25:%.*]] = icmp ult ptr [[TMP24]], [[SCEVGEP17]]
 ; CHECK-NEXT:    [[TMP26:%.*]] = or i1 [[TMP25]], [[MUL_OVERFLOW20]]
-; CHECK-NEXT:    [[SCEVGEP21:%.*]] = getelementptr i8, ptr [[A]], i64 4
+; CHECK-NEXT:    [[SCEVGEP21:%.*]] = getelementptr i8, ptr [[A]], i64 28
 ; CHECK-NEXT:    [[MUL22:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT23:%.*]] = extractvalue { i64, i1 } [[MUL22]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW24:%.*]] = extractvalue { i64, i1 } [[MUL22]], 1
@@ -326,37 +343,47 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr i8, ptr [[SCEVGEP21]], i64 [[MUL_RESULT23]]
 ; CHECK-NEXT:    [[TMP29:%.*]] = icmp ult ptr [[TMP28]], [[SCEVGEP21]]
 ; CHECK-NEXT:    [[TMP30:%.*]] = or i1 [[TMP29]], [[MUL_OVERFLOW24]]
+; CHECK-NEXT:    [[SCEVGEP31:%.*]] = getelementptr i8, ptr [[B]], i64 4
+; CHECK-NEXT:    [[MUL29:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP2]])
+; CHECK-NEXT:    [[MUL_RESULT30:%.*]] = extractvalue { i64, i1 } [[MUL29]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW31:%.*]] = extractvalue { i64, i1 } [[MUL29]], 1
+; CHECK-NEXT:    [[TMP67:%.*]] = sub i64 0, [[MUL_RESULT30]]
+; CHECK-NEXT:    [[TMP68:%.*]] = getelementptr i8, ptr [[SCEVGEP31]], i64 [[MUL_RESULT30]]
+; CHECK-NEXT:    [[TMP69:%.*]] = icmp ult ptr [[TMP68]], [[SCEVGEP31]]
+; CHECK-NEXT:    [[TMP70:%.*]] = or i1 [[TMP69]], [[MUL_OVERFLOW31]]
 ; CHECK-NEXT:    [[MUL25:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 32, i64 [[TMP2]])
 ; CHECK-NEXT:    [[MUL_RESULT26:%.*]] = extractvalue { i64, i1 } [[MUL25]], 0
 ; CHECK-NEXT:    [[MUL_OVERFLOW27:%.*]] = extractvalue { i64, i1 } [[MUL25]], 1
 ; CHECK-NEXT:    [[TMP31:%.*]] = sub i64 0, [[MUL_RESULT26]]
-; CHECK-NEXT:    [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[MUL_RESULT26]]
-; CHECK-NEXT:    [[TMP33:%.*]] = icmp ult ptr [[TMP32]], [[A]]
+; CHECK-NEXT:    [[TMP71:%.*]] = getelementptr i8, ptr [[B]], i64 [[MUL_RESULT26]]
+; CHECK-NEXT:    [[TMP33:%.*]] = icmp ult ptr [[TMP71]], [[B]]
 ; CHECK-NEXT:    [[TMP34:%.*]] = or i1 [[TMP33]], [[MUL_OVERFLOW27]]
+; CHECK-NEXT:    [[TMP6:%.*]] = or i1 [[TMP44]], [[TMP57]]
 ; CHECK-NEXT:    [[TMP35:%.*]] = or i1 [[TMP6]], [[TMP10]]
 ; CHECK-NEXT:    [[TMP36:%.*]] = or i1 [[TMP35]], [[TMP14]]
 ; CHECK-NEXT:    [[TMP37:%.*]] = or i1 [[TMP36]], [[TMP18]]
 ; CHECK-NEXT:    [[TMP38:%.*]] = or i1 [[TMP37]], [[TMP22]]
 ; CHECK-NEXT:    [[TMP39:%.*]] = or i1 [[TMP38]], [[TMP26]]
 ; CHECK-NEXT:    [[TMP40:%.*]] = or i1 [[TMP39]], [[TMP30]]
-; CHECK-NEXT:    [[TMP41:%.*]] = or i1 [[TMP40]], [[TMP34]]
-; CHECK-NEXT:    br i1 [[TMP41]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
+; CHECK-NEXT:    [[TMP72:%.*]] = or i1 [[TMP40]], [[TMP70]]
+; CHECK-NEXT:    [[TMP73:%.*]] = or i1 [[TMP72]], [[TMP34]]
+; CHECK-NEXT:    br i1 [[TMP73]], label %[[SCALAR_PH]], label %[[VECTOR_MEMCHECK:.*]]
 ; CHECK:       [[VECTOR_MEMCHECK]]:
 ; CHECK-NEXT:    [[TMP42:%.*]] = lshr i64 [[N]], 3
 ; CHECK-NEXT:    [[TMP43:%.*]] = shl i64 [[TMP42]], 5
-; CHECK-NEXT:    [[TMP44:%.*]] = add i64 [[TMP43]], 32
-; CHECK-NEXT:    [[SCEVGEP28:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP44]]
 ; CHECK-NEXT:    [[TMP45:%.*]] = add nuw nsw i64 [[TMP43]], 4
 ; CHECK-NEXT:    [[SCEVGEP29:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP45]]
+; CHECK-NEXT:    [[TMP53:%.*]] = add i64 [[TMP43]], 32
+; CHECK-NEXT:    [[SCEVGEP28:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP53]]
 ; CHECK-NEXT:    [[TMP46:%.*]] = shl i64 [[TMP42]], 4
 ; CHECK-NEXT:    [[TMP47:%.*]] = add nuw nsw i64 [[TMP46]], 8
 ; CHECK-NEXT:    [[SCEVGEP30:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP47]]
-; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[A]], [[SCEVGEP29]]
 ; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]]
-; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
-; CHECK-NEXT:    [[BOUND031:%.*]] = icmp ult ptr [[A]], [[SCEVGEP30]]
+; CHECK-NEXT:    [[BOUND2:%.*]] = icmp ult ptr [[A]], [[SCEVGEP29]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND1]], [[BOUND2]]
 ; CHECK-NEXT:    [[BOUND132:%.*]] = icmp ult ptr [[B]], [[SCEVGEP28]]
-; CHECK-NEXT:    [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND031]], [[BOUND132]]
+; CHECK-NEXT:    [[BOUND133:%.*]] = icmp ult ptr [[A]], [[SCEVGEP30]]
+; CHECK-NEXT:    [[FOUND_CONFLICT33:%.*]] = and i1 [[BOUND132]], [[BOUND133]]
 ; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT33]]
 ; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
@@ -378,7 +405,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP50]]
 ; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison), !alias.scope [[META6:![0-9]+]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison), !alias.scope [[META6:![0-9]+]], !noalias [[META9:![0-9]+]]
 ; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -390,7 +417,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -431,7 +458,7 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    store i32 0, ptr [[GEP_A_7]], align 4
 ; CHECK-NEXT:    [[IV_NEXT_7]] = add nuw nsw i64 [[IV]], 8
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -498,7 +525,7 @@ define void @interleave_store_double_i64(ptr %dst) {
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -512,7 +539,7 @@ define void @interleave_store_double_i64(ptr %dst) {
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_0]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -620,7 +647,7 @@ define void @interleave_store_i64_double_2(ptr %dst) {
 ; CHECK-NEXT:    store <4 x double> [[INTERLEAVED_VEC]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], splat (i64 2)
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br i1 true, label %[[EXIT:.*]], label %[[SCALAR_PH]]
 ; CHECK:       [[SCALAR_PH]]:
@@ -634,7 +661,7 @@ define void @interleave_store_i64_double_2(ptr %dst) {
 ; CHECK-NEXT:    store double 0.000000e+00, ptr [[GEP_1]], align 8
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i64 [[IV]], 1
-; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP16:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -670,10 +697,12 @@ attributes #1 = { "min-legal-vector-width"="0" "target-cpu"="cascadelake" }
 ; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
 ; CHECK: [[META7]] = distinct !{[[META7]], [[META8:![0-9]+]]}
 ; CHECK: [[META8]] = distinct !{[[META8]], !"LVerDomain"}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]], [[META2]]}
-; CHECK: [[LOOP10]] = distinct !{[[LOOP10]], [[META1]]}
+; CHECK: [[META9]] = !{[[META10:![0-9]+]]}
+; CHECK: [[META10]] = distinct !{[[META10]], [[META8]]}
 ; CHECK: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]}
-; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META2]], [[META1]]}
+; CHECK: [[LOOP12]] = distinct !{[[LOOP12]], [[META1]]}
 ; CHECK: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP14]] = distinct !{[[LOOP14]], [[META2]], [[META1]]}
+; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
+; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
index c076c0e849fa9..fc5795708c7d8 100644
--- a/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/bottomup_basic.ll
@@ -352,3 +352,37 @@ define void @diamondWithConstantVector(ptr %ptr) {
   store i32 %orB1, ptr %gepB1
   ret void
 }
+
+; Check that we don't get def-after-use errors due to wrong placement
+; of new vector instructions.
+define void @vecInstrsPlacement(ptr %ptr0) {
+; CHECK-LABEL: define void @vecInstrsPlacement(
+; CHECK-SAME: ptr [[PTR0:%.*]]) {
+; CHECK-NEXT:    [[VECL2:%.*]] = load <2 x double>, ptr [[PTR0]], align 8
+; CHECK-NEXT:    [[VECL:%.*]] = load <2 x double>, ptr [[PTR0]], align 8
+; CHECK-NEXT:    [[VEC2:%.*]] = fmul <2 x double> [[VECL]], [[VECL2]]
+; CHECK-NEXT:    [[VEC:%.*]] = fmul <2 x double> [[VECL]], [[VECL2]]
+; CHECK-NEXT:    [[VEC5:%.*]] = fadd <2 x double> [[VEC]], [[VEC2]]
+; CHECK-NEXT:    store <2 x double> [[VEC5]], ptr [[PTR0]], align 8
+; CHECK-NEXT:    ret void
+;
+  %ptr1 = getelementptr inbounds double, ptr %ptr0, i64 1
+  %ldA_0 = load double, ptr %ptr0
+  %ldA_1 = load double, ptr %ptr1
+
+  %ldB_0 = load double, ptr %ptr0
+  %ldB_1 = load double, ptr %ptr1
+
+  %mul0 = fmul double %ldA_0, %ldB_0
+  %mul1 = fmul double %ldA_1, %ldB_1
+
+  %mul2 = fmul double %ldA_0, %ldB_0
+  %mul3 = fmul double %ldA_1, %ldB_1
+
+  %add0 = fadd double %mul0, %mul2
+  %add1 = fadd double %mul1, %mul3
+
+  store double %add0, ptr %ptr0
+  store double %add1, ptr %ptr1
+  ret void
+}
diff --git a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll
index 7741d8c64c8fc..5b9177ba4b3bf 100644
--- a/llvm/test/Transforms/SandboxVectorizer/scheduler.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/scheduler.ll
@@ -7,17 +7,17 @@ define void @check_dag_scheduler_update(ptr noalias %p, ptr noalias %p1) {
 ; CHECK-LABEL: define void @check_dag_scheduler_update(
 ; CHECK-SAME: ptr noalias [[P:%.*]], ptr noalias [[P1:%.*]]) {
 ; CHECK-NEXT:    [[I:%.*]] = load i32, ptr [[P]], align 4
-; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32
+; CHECK-NEXT:    [[VECL:%.*]] = load <4 x i32>, ptr [[P]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 34
 ; CHECK-NEXT:    [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4
 ; CHECK-NEXT:    [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33
 ; CHECK-NEXT:    [[I4:%.*]] = load i32, ptr [[ARRAYIDX11]], align 4
-; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i32, ptr [[P]], i64 34
+; CHECK-NEXT:    [[ARRAYIDX18:%.*]] = getelementptr i32, ptr [[P]], i64 32
 ; CHECK-NEXT:    [[I6:%.*]] = load i32, ptr [[ARRAYIDX18]], align 4
 ; CHECK-NEXT:    [[PACK:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0
-; CHECK-NEXT:    [[PACK1:%.*]] = insertelement <4 x i32> [[PACK]], i32 [[I2]], i32 1
+; CHECK-NEXT:    [[PACK1:%.*]] = insertelement <4 x i32> [[PACK]], i32 [[I6]], i32 1
 ; CHECK-NEXT:    [[PACK2:%.*]] = insertelement <4 x i32> [[PACK1]], i32 [[I4]], i32 2
-; CHECK-NEXT:    [[PACK3:%.*]] = insertelement <4 x i32> [[PACK2]], i32 [[I6]], i32 3
-; CHECK-NEXT:    [[VECL:%.*]] = load <4 x i32>, ptr [[P]], align 4
+; CHECK-NEXT:    [[PACK3:%.*]] = insertelement <4 x i32> [[PACK2]], i32 [[I2]], i32 3
 ; CHECK-NEXT:    [[VEC:%.*]] = add nsw <4 x i32> [[PACK3]], [[VECL]]
 ; CHECK-NEXT:    store <4 x i32> [[VEC]], ptr [[P1]], align 4
 ; CHECK-NEXT:    ret void
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp
index 5b033f0edcb02..c8fee1c24dbcb 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/InstrMapsTest.cpp
@@ -53,37 +53,41 @@ define void @foo(i8 %v0, i8 %v1, i8 %v2, i8 %v3, <2 x i8> %vec) {
   auto *VAdd0 = cast<sandboxir::BinaryOperator>(&*It++);
   [[maybe_unused]] auto *Ret = cast<sandboxir::ReturnInst>(&*It++);
 
-  sandboxir::InstrMaps IMaps(Ctx);
-  // Check with empty IMaps.
-  EXPECT_EQ(IMaps.getVectorForOrig(Add0), nullptr);
-  EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr);
-  EXPECT_FALSE(IMaps.getOrigLane(Add0, Add0));
-  // Check with 1 match.
-  IMaps.registerVector({Add0, Add1}, VAdd0);
-  EXPECT_EQ(IMaps.getVectorForOrig(Add0), VAdd0);
-  EXPECT_EQ(IMaps.getVectorForOrig(Add1), VAdd0);
-  EXPECT_FALSE(IMaps.getOrigLane(VAdd0, VAdd0)); // Bad Orig value
-  EXPECT_FALSE(IMaps.getOrigLane(Add0, Add0));   // Bad Vector value
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add0), 0U);
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add1), 1U);
-  // Check when the same vector maps to different original values (which is
-  // common for vector constants).
-  IMaps.registerVector({Add2, Add3}, VAdd0);
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add2), 0U);
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add3), 1U);
-  // Check when we register for a second time.
+  sandboxir::InstrMaps IMaps;
+  {
+    // Check with empty IMaps.
+    sandboxir::Action A(nullptr, {Add0}, {}, 0);
+    EXPECT_EQ(IMaps.getVectorForOrig(Add0), nullptr);
+    EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr);
+    EXPECT_FALSE(IMaps.getOrigLane(&A, Add0));
+  }
+  {
+    // Check with 1 match.
+    sandboxir::Action A(nullptr, {Add0, Add1}, {}, 0);
+    sandboxir::Action OtherA(nullptr, {}, {}, 0);
+    IMaps.registerVector({Add0, Add1}, &A);
+    EXPECT_EQ(IMaps.getVectorForOrig(Add0), &A);
+    EXPECT_EQ(IMaps.getVectorForOrig(Add1), &A);
+    EXPECT_FALSE(IMaps.getOrigLane(&A, VAdd0));     // Bad Orig value
+    EXPECT_FALSE(IMaps.getOrigLane(&OtherA, Add0)); // Bad Vector value
+    EXPECT_EQ(*IMaps.getOrigLane(&A, Add0), 0U);
+    EXPECT_EQ(*IMaps.getOrigLane(&A, Add1), 1U);
+  }
+  {
+    // Check when the same vector maps to different original values (which is
+    // common for vector constants).
+    sandboxir::Action A(nullptr, {Add2, Add3}, {}, 0);
+    IMaps.registerVector({Add2, Add3}, &A);
+    EXPECT_EQ(*IMaps.getOrigLane(&A, Add2), 0U);
+    EXPECT_EQ(*IMaps.getOrigLane(&A, Add3), 1U);
+  }
+  {
+    // Check when we register for a second time.
+    sandboxir::Action A(nullptr, {Add2, Add3}, {}, 0);
 #ifndef NDEBUG
-  EXPECT_DEATH(IMaps.registerVector({Add1, Add0}, VAdd0), ".*exists.*");
+    EXPECT_DEATH(IMaps.registerVector({Add1, Add0}, &A), ".*exists.*");
 #endif // NDEBUG
-  // Check callbacks: erase original instr.
-  Add0->eraseFromParent();
-  EXPECT_FALSE(IMaps.getOrigLane(VAdd0, Add0));
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd0, Add1), 1U);
-  EXPECT_EQ(IMaps.getVectorForOrig(Add0), nullptr);
-  // Check callbacks: erase vector instr.
-  VAdd0->eraseFromParent();
-  EXPECT_FALSE(IMaps.getOrigLane(VAdd0, Add1));
-  EXPECT_EQ(IMaps.getVectorForOrig(Add1), nullptr);
+  }
 }
 
 TEST_F(InstrMapsTest, VectorLanes) {
@@ -91,7 +95,6 @@ TEST_F(InstrMapsTest, VectorLanes) {
 define void @foo(<2 x i8> %v0, <2 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3) {
   %vadd0 = add <2 x i8> %v0, %v1
   %vadd1 = add <2 x i8> %v0, %v1
-  %vadd2 = add <4 x i8> %v2, %v3
   ret void
 }
 )IR");
@@ -103,12 +106,14 @@ define void @foo(<2 x i8> %v0, <2 x i8> %v1, <4 x i8> %v2, <4 x i8> %v3) {
 
   auto *VAdd0 = cast<sandboxir::BinaryOperator>(&*It++);
   auto *VAdd1 = cast<sandboxir::BinaryOperator>(&*It++);
-  auto *VAdd2 = cast<sandboxir::BinaryOperator>(&*It++);
 
-  sandboxir::InstrMaps IMaps(Ctx);
+  sandboxir::InstrMaps IMaps;
 
-  // Check that the vector lanes are calculated correctly.
-  IMaps.registerVector({VAdd0, VAdd1}, VAdd2);
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd2, VAdd0), 0U);
-  EXPECT_EQ(*IMaps.getOrigLane(VAdd2, VAdd1), 2U);
+  {
+    // Check that the vector lanes are calculated correctly.
+    sandboxir::Action A(nullptr, {VAdd0, VAdd1}, {}, 0);
+    IMaps.registerVector({VAdd0, VAdd1}, &A);
+    EXPECT_EQ(*IMaps.getOrigLane(&A, VAdd0), 0U);
+    EXPECT_EQ(*IMaps.getOrigLane(&A, VAdd1), 2U);
+  }
 }
diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
index 15f8166b705fc..99519d17d0e8e 100644
--- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
+++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/LegalityTest.cpp
@@ -133,7 +133,7 @@ define void @foo(ptr %ptr, <2 x float> %vec2, <3 x float> %vec3, i8 %arg, float
   auto *Sel0 = cast<sandboxir::SelectInst>(&*It++);
   auto *Sel1 = cast<sandboxir::SelectInst>(&*It++);
 
-  llvm::sandboxir::InstrMaps IMaps(Ctx);
+  llvm::sandboxir::InstrMaps IMaps;
   sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps);
   const auto &Result =
       Legality.canVectorize({St0, St1}, /*SkipScheduling=*/true);
@@ -285,7 +285,7 @@ define void @foo(ptr %ptr) {
   auto *St0 = cast<sandboxir::StoreInst>(&*It++);
   auto *St1 = cast<sandboxir::StoreInst>(&*It++);
 
-  llvm::sandboxir::InstrMaps IMaps(Ctx);
+  llvm::sandboxir::InstrMaps IMaps;
   sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps);
   {
     // Can vectorize St0,St1.
@@ -321,7 +321,7 @@ define void @foo() {
   };
 
   sandboxir::Context Ctx(C);
-  llvm::sandboxir::InstrMaps IMaps(Ctx);
+  llvm::sandboxir::InstrMaps IMaps;
   sandboxir::LegalityAnalysis Legality(*AA, *SE, DL, Ctx, IMaps);
   EXPECT_TRUE(
       Matches(Legality.createLegalityResult<sandboxir::Widen>(), "Widen"));
@@ -368,32 +368,34 @@ define void @foo(ptr %ptr) {
 
   sandboxir::CollectDescr::DescrVecT Descrs;
   using EEDescr = sandboxir::CollectDescr::ExtractElementDescr;
-
+  SmallVector<sandboxir::Value *> Bndl({VLd});
+  SmallVector<sandboxir::Value *> UB;
+  sandboxir::Action VLdA(nullptr, Bndl, UB, 0);
   {
     // Check single input, no shuffle.
-    Descrs.push_back(EEDescr(VLd, 0));
-    Descrs.push_back(EEDescr(VLd, 1));
+    Descrs.push_back(EEDescr(&VLdA, 0));
+    Descrs.push_back(EEDescr(&VLdA, 1));
     sandboxir::CollectDescr CD(std::move(Descrs));
     EXPECT_TRUE(CD.getSingleInput());
-    EXPECT_EQ(CD.getSingleInput()->first, VLd);
+    EXPECT_EQ(CD.getSingleInput()->first, &VLdA);
     EXPECT_THAT(CD.getSingleInput()->second, testing::ElementsAre(0, 1));
     EXPECT_TRUE(CD.hasVectorInputs());
   }
   {
     // Check single input, shuffle.
-    Descrs.push_back(EEDescr(VLd, 1));
-    Descrs.push_back(EEDescr(VLd, 0));
+    Descrs.push_back(EEDescr(&VLdA, 1));
+    Descrs.push_back(EEDescr(&VLdA, 0));
     sandboxir::CollectDescr CD(std::move(Descrs));
     EXPECT_TRUE(CD.getSingleInput());
-    EXPECT_EQ(CD.getSingleInput()->first, VLd);
+    EXPECT_EQ(CD.getSingleInput()->first, &VLdA);
     EXPECT_THAT(CD.getSingleInput()->second, testing::ElementsAre(1, 0));
     EXPECT_TRUE(CD.hasVectorInputs());
   }
   {
     // Check multiple inputs.
     Descrs.push_back(EEDescr(Ld0));
-    Descrs.push_back(EEDescr(VLd, 0));
-    Descrs.push_back(EEDescr(VLd, 1));
+    Descrs.push_back(EEDescr(&VLdA, 0));
+    Descrs.push_back(EEDescr(&VLdA, 1));
     sandboxir::CollectDescr CD(std::move(Descrs));
     EXPECT_FALSE(CD.getSingleInput());
     EXPECT_TRUE(CD.hasVectorInputs());
diff --git a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
index ff4f558ca2fcf..fdd631bc40398 100644
--- a/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/CodeGen/BUILD.gn
@@ -93,6 +93,7 @@ static_library("CodeGen") {
     "CodeGenTypes.cpp",
     "ConstantInitBuilder.cpp",
     "CoverageMappingGen.cpp",
+    "HLSLBufferLayoutBuilder.cpp",
     "ItaniumCXXABI.cpp",
     "LinkInModulesPass.cpp",
     "MacroPPCallbacks.cpp",
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
index 05385ba491525..8f9a7e388ebc7 100644
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -11136,6 +11136,7 @@ td_library(
         "include/mlir/Dialect/Linalg/IR/LinalgEnums.td",
         "include/mlir/Dialect/Linalg/IR/LinalgInterfaces.td",
         "include/mlir/Dialect/Linalg/IR/LinalgOps.td",
+        "include/mlir/Dialect/Linalg/IR/LinalgRelayoutOps.td",
     ],
     includes = ["include"],
     deps = [