diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
index 5e64d23874ec17..c25ee42d0899ae 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
+++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.cpp
@@ -67,7 +67,8 @@ SizeofExpressionCheck::SizeofExpressionCheck(StringRef Name,
       WarnOnSizeOfCompareToConstant(
           Options.get("WarnOnSizeOfCompareToConstant", true)),
       WarnOnSizeOfPointerToAggregate(
-          Options.get("WarnOnSizeOfPointerToAggregate", true)) {}
+          Options.get("WarnOnSizeOfPointerToAggregate", true)),
+      WarnOnSizeOfPointer(Options.get("WarnOnSizeOfPointer", false)) {}
 
 void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
   Options.store(Opts, "WarnOnSizeOfConstant", WarnOnSizeOfConstant);
@@ -78,6 +79,7 @@ void SizeofExpressionCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) {
                 WarnOnSizeOfCompareToConstant);
   Options.store(Opts, "WarnOnSizeOfPointerToAggregate",
                 WarnOnSizeOfPointerToAggregate);
+  Options.store(Opts, "WarnOnSizeOfPointer", WarnOnSizeOfPointer);
 }
 
 void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
@@ -127,17 +129,30 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
   const auto ConstStrLiteralDecl =
       varDecl(isDefinition(), hasType(hasCanonicalType(CharPtrType)),
               hasInitializer(ignoringParenImpCasts(stringLiteral())));
+  const auto VarWithConstStrLiteralDecl = expr(
+      hasType(hasCanonicalType(CharPtrType)),
+      ignoringParenImpCasts(declRefExpr(hasDeclaration(ConstStrLiteralDecl))));
   Finder->addMatcher(
-      sizeOfExpr(has(ignoringParenImpCasts(
-                     expr(hasType(hasCanonicalType(CharPtrType)),
-                          ignoringParenImpCasts(declRefExpr(
-                              hasDeclaration(ConstStrLiteralDecl)))))))
+      sizeOfExpr(has(ignoringParenImpCasts(VarWithConstStrLiteralDecl)))
           .bind("sizeof-charp"),
       this);
 
-  // Detect sizeof(ptr) where ptr points to an aggregate (i.e. sizeof(&S)).
-  // Do not find it if RHS of a 'sizeof(arr) / sizeof(arr[0])' expression.
-  if (WarnOnSizeOfPointerToAggregate) {
+  // Detect sizeof(ptr) where ptr is a pointer (CWE-467).
+  //
+  // In WarnOnSizeOfPointerToAggregate mode only report cases when ptr points
+  // to an aggregate type or ptr is an expression that (implicitly or
+  // explicitly) casts an array to a pointer type. (These are more suspicious
+  // than other sizeof(ptr) expressions because they can appear as distorted
+  // forms of the common sizeof(aggregate) expressions.)
+  //
+  // To avoid false positives, the check doesn't report expressions like
+  // 'sizeof(pp[0])' and 'sizeof(*pp)' where `pp` is a pointer-to-pointer or
+  // array of pointers. (This filters out both `sizeof(arr) / sizeof(arr[0])`
+  // expressions and other cases like `p = realloc(p, newsize * sizeof(*p));`.)
+  //
+  // Moreover this generic message is suppressed in cases that are also matched
+  // by the more concrete matchers 'sizeof-this' and 'sizeof-charp'.
+  if (WarnOnSizeOfPointerToAggregate || WarnOnSizeOfPointer) {
     const auto ArrayExpr =
         ignoringParenImpCasts(hasType(hasCanonicalType(arrayType())));
     const auto ArrayCastExpr = expr(anyOf(
@@ -149,32 +164,31 @@ void SizeofExpressionCheck::registerMatchers(MatchFinder *Finder) {
 
     const auto PointerToStructType =
         hasUnqualifiedDesugaredType(pointerType(pointee(recordType())));
-    const auto PointerToStructExpr = expr(
-        hasType(hasCanonicalType(PointerToStructType)), unless(cxxThisExpr()));
-
-    const auto ArrayOfPointersExpr = ignoringParenImpCasts(
-        hasType(hasCanonicalType(arrayType(hasElementType(pointerType()))
-                                     .bind("type-of-array-of-pointers"))));
-    const auto ArrayOfSamePointersExpr =
-        ignoringParenImpCasts(hasType(hasCanonicalType(
-            arrayType(equalsBoundNode("type-of-array-of-pointers")))));
+    const auto PointerToStructTypeWithBinding =
+        type(PointerToStructType).bind("struct-type");
+    const auto PointerToStructExpr =
+        expr(hasType(hasCanonicalType(PointerToStructType)));
+
+    const auto PointerToDetectedExpr =
+        WarnOnSizeOfPointer
+            ? expr(hasType(hasUnqualifiedDesugaredType(pointerType())))
+            : expr(anyOf(ArrayCastExpr, PointerToArrayExpr,
+                         PointerToStructExpr));
+
     const auto ZeroLiteral = ignoringParenImpCasts(integerLiteral(equals(0)));
-    const auto ArrayOfSamePointersZeroSubscriptExpr =
-        ignoringParenImpCasts(arraySubscriptExpr(
-            hasBase(ArrayOfSamePointersExpr), hasIndex(ZeroLiteral)));
-    const auto ArrayLengthExprDenom =
-        expr(hasParent(binaryOperator(hasOperatorName("/"),
-                                      hasLHS(ignoringParenImpCasts(sizeOfExpr(
-                                          has(ArrayOfPointersExpr)))))),
-             sizeOfExpr(has(ArrayOfSamePointersZeroSubscriptExpr)));
+    const auto SubscriptExprWithZeroIndex =
+        arraySubscriptExpr(hasIndex(ZeroLiteral));
+    const auto DerefExpr =
+        ignoringParenImpCasts(unaryOperator(hasOperatorName("*")));
 
     Finder->addMatcher(
-        expr(sizeOfExpr(anyOf(
-                 has(ignoringParenImpCasts(anyOf(
-                     ArrayCastExpr, PointerToArrayExpr, PointerToStructExpr))),
-                 has(PointerToStructType))),
-             unless(ArrayLengthExprDenom))
-            .bind("sizeof-pointer-to-aggregate"),
+        expr(sizeOfExpr(anyOf(has(ignoringParenImpCasts(
+                                  expr(PointerToDetectedExpr, unless(DerefExpr),
+                                       unless(SubscriptExprWithZeroIndex),
+                                       unless(VarWithConstStrLiteralDecl),
+                                       unless(cxxThisExpr())))),
+                              has(PointerToStructTypeWithBinding))))
+            .bind("sizeof-pointer"),
         this);
   }
 
@@ -292,11 +306,17 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
     diag(E->getBeginLoc(),
          "suspicious usage of 'sizeof(char*)'; do you mean 'strlen'?")
         << E->getSourceRange();
-  } else if (const auto *E =
-                 Result.Nodes.getNodeAs<Expr>("sizeof-pointer-to-aggregate")) {
-    diag(E->getBeginLoc(),
-         "suspicious usage of 'sizeof(A*)'; pointer to aggregate")
-        << E->getSourceRange();
+  } else if (const auto *E = Result.Nodes.getNodeAs<Expr>("sizeof-pointer")) {
+    if (Result.Nodes.getNodeAs<Type>("struct-type")) {
+      diag(E->getBeginLoc(),
+           "suspicious usage of 'sizeof(A*)' on pointer-to-aggregate type; did "
+           "you mean 'sizeof(A)'?")
+          << E->getSourceRange();
+    } else {
+      diag(E->getBeginLoc(), "suspicious usage of 'sizeof()' on an expression "
+                             "that results in a pointer")
+          << E->getSourceRange();
+    }
   } else if (const auto *E = Result.Nodes.getNodeAs<BinaryOperator>(
                  "sizeof-compare-constant")) {
     diag(E->getOperatorLoc(),
@@ -332,18 +352,23 @@ void SizeofExpressionCheck::check(const MatchFinder::MatchResult &Result) {
                                 " numerator is not a multiple of denominator")
           << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
     } else if (NumTy && DenomTy && NumTy == DenomTy) {
+      // FIXME: This message is wrong, it should not refer to sizeof "pointer"
+      // usage (and by the way, it would be to clarify all the messages).
       diag(E->getOperatorLoc(),
            "suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)'")
           << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
-    } else if (PointedTy && DenomTy && PointedTy == DenomTy) {
-      diag(E->getOperatorLoc(),
-           "suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'")
-          << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
-    } else if (NumTy && DenomTy && NumTy->isPointerType() &&
-               DenomTy->isPointerType()) {
-      diag(E->getOperatorLoc(),
-           "suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)'")
-          << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
+    } else if (!WarnOnSizeOfPointer) {
+      // When 'WarnOnSizeOfPointer' is enabled, these messages become redundant:
+      if (PointedTy && DenomTy && PointedTy == DenomTy) {
+        diag(E->getOperatorLoc(),
+             "suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'")
+            << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
+      } else if (NumTy && DenomTy && NumTy->isPointerType() &&
+                 DenomTy->isPointerType()) {
+        diag(E->getOperatorLoc(),
+             "suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)'")
+            << E->getLHS()->getSourceRange() << E->getRHS()->getSourceRange();
+      }
     }
   } else if (const auto *E =
                  Result.Nodes.getNodeAs<Expr>("sizeof-sizeof-expr")) {
diff --git a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h
index 55becdd4ecdba1..9ca17bc9e6f124 100644
--- a/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h
+++ b/clang-tools-extra/clang-tidy/bugprone/SizeofExpressionCheck.h
@@ -30,6 +30,7 @@ class SizeofExpressionCheck : public ClangTidyCheck {
   const bool WarnOnSizeOfThis;
   const bool WarnOnSizeOfCompareToConstant;
   const bool WarnOnSizeOfPointerToAggregate;
+  const bool WarnOnSizeOfPointer;
 };
 
 } // namespace clang::tidy::bugprone
diff --git a/clang-tools-extra/clangd/index/remote/CMakeLists.txt b/clang-tools-extra/clangd/index/remote/CMakeLists.txt
index ed6269d2ccaa98..106bbeff84ccf3 100644
--- a/clang-tools-extra/clangd/index/remote/CMakeLists.txt
+++ b/clang-tools-extra/clangd/index/remote/CMakeLists.txt
@@ -26,7 +26,6 @@ if (CLANGD_ENABLE_REMOTE)
     clangdRemoteIndexProto
     clangdRemoteIndexServiceProto
     clangdRemoteMarshalling
-    clangBasic
     clangDaemon
     clangdSupport
 
@@ -35,6 +34,11 @@ if (CLANGD_ENABLE_REMOTE)
     clangdRemoteIndexServiceProto
     )
 
+  clang_target_link_libraries(clangdRemoteIndex
+    PRIVATE
+    clangBasic
+    )
+
   add_subdirectory(marshalling)
   add_subdirectory(server)
   add_subdirectory(monitor)
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 0c0c10605a8307..6bf70c5cf4f8a5 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -237,6 +237,12 @@ Changes in existing checks
   <clang-tidy/checks/bugprone/optional-value-conversion>` check by eliminating
   false positives resulting from use of optionals in unevaluated context.
 
+- Improved :doc:`bugprone-sizeof-expression
+  <clang-tidy/checks/bugprone/sizeof-expression>` check by eliminating some
+  false positives and adding a new (off-by-default) option
+  `WarnOnSizeOfPointer` that reports all ``sizeof(pointer)`` expressions
+  (except for a few that are idiomatic).
+
 - Improved :doc:`bugprone-suspicious-include
   <clang-tidy/checks/bugprone/suspicious-include>` check by replacing the local
   options `HeaderFileExtensions` and `ImplementationFileExtensions` by the
@@ -331,6 +337,10 @@ Changes in existing checks
   <clang-tidy/checks/misc/header-include-cycle>` check by avoiding crash for self
   include cycles.
 
+- Improved :doc:`misc-include-cleaner
+  <clang-tidy/checks/misc/include-cleaner>` check by avoiding false positives for
+  the functions with the same name as standard library functions.
+
 - Improved :doc:`misc-unused-using-decls
   <clang-tidy/checks/misc/unused-using-decls>` check by replacing the local
   option `HeaderFileExtensions` by the global option of the same name.
diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
index c37df1706eb4e1..ed5bb4fbb89baf 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/sizeof-expression.rst
@@ -190,6 +190,15 @@ Options
 
 .. option:: WarnOnSizeOfPointerToAggregate
 
-   When `true`, the check will warn on an expression like
-   ``sizeof(expr)`` where the expression is a pointer
-   to aggregate. Default is `true`.
+   When `true`, the check will warn when the argument of ``sizeof`` is either a
+   pointer-to-aggregate type, an expression returning a pointer-to-aggregate
+   value or an expression that returns a pointer from an array-to-pointer
+   conversion (that may be implicit or explicit, for example ``array + 2`` or
+   ``(int *)array``). Default is `true`.
+
+.. option:: WarnOnSizeOfPointer
+
+   When `true`, the check will report all expressions where the argument of
+   ``sizeof`` is an expression that produces a pointer (except for a few
+   idiomatic expressions that are probably intentional and correct).
+   This detects occurrences of CWE 467. Default is `false`.
diff --git a/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp b/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp
index 78e783a62eb27f..9148d36a5038f9 100644
--- a/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp
+++ b/clang-tools-extra/include-cleaner/lib/LocateSymbol.cpp
@@ -14,6 +14,7 @@
 #include "clang/AST/DeclTemplate.h"
 #include "clang/Tooling/Inclusions/StandardLibrary.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/raw_ostream.h"
 #include <utility>
 #include <vector>
 
@@ -40,8 +41,11 @@ Hints declHints(const Decl *D) {
 std::vector<Hinted<SymbolLocation>> locateDecl(const Decl &D) {
   std::vector<Hinted<SymbolLocation>> Result;
   // FIXME: Should we also provide physical locations?
-  if (auto SS = tooling::stdlib::Recognizer()(&D))
-    return {{*SS, Hints::CompleteSymbol}};
+  if (auto SS = tooling::stdlib::Recognizer()(&D)) {
+    Result.push_back({*SS, Hints::CompleteSymbol});
+    if (!D.hasBody())
+      return Result;
+  }
   // FIXME: Signal foreign decls, e.g. a forward declaration not owned by a
   // library. Some useful signals could be derived by checking the DeclContext.
   // Most incidental forward decls look like:
diff --git a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp
index 07302142a13e36..fdcbf25fd628c0 100644
--- a/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp
+++ b/clang-tools-extra/include-cleaner/unittests/FindHeadersTest.cpp
@@ -628,6 +628,17 @@ TEST_F(HeadersForSymbolTest, StandardHeaders) {
                            tooling::stdlib::Header::named("<assert.h>")));
 }
 
+TEST_F(HeadersForSymbolTest, NonStandardHeaders) {
+  Inputs.Code = "void assert() {}";
+  buildAST();
+  EXPECT_THAT(
+      headersFor("assert"),
+      // Respect the ordering from the stdlib mapping.
+      UnorderedElementsAre(physicalHeader("input.mm"),
+                           tooling::stdlib::Header::named("<cassert>"),
+                           tooling::stdlib::Header::named("<assert.h>")));
+}
+
 TEST_F(HeadersForSymbolTest, ExporterNoNameMatch) {
   Inputs.Code = R"cpp(
     #include "exporter/foo.h"
diff --git a/clang-tools-extra/pseudo/lib/CMakeLists.txt b/clang-tools-extra/pseudo/lib/CMakeLists.txt
index f92f79be121508..a13b5d20cf7c3b 100644
--- a/clang-tools-extra/pseudo/lib/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/CMakeLists.txt
@@ -14,8 +14,6 @@ add_clang_library(clangPseudo
   Token.cpp
 
   LINK_LIBS
-  clangBasic
-  clangLex
   clangPseudoGrammar
 
   DEPENDS
@@ -25,3 +23,9 @@ add_clang_library(clangPseudo
   target_include_directories(clangPseudo INTERFACE
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include>
   )
+
+clang_target_link_libraries(clangPseudo
+  PRIVATE
+  clangBasic
+  clangLex
+  )
diff --git a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
index d56d16c893c3d4..2fecdce6a10f9c 100644
--- a/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
+++ b/clang-tools-extra/pseudo/lib/cxx/CMakeLists.txt
@@ -9,7 +9,11 @@ add_clang_library(clangPseudoCXX
   cxx_gen
 
   LINK_LIBS
-  clangBasic
   clangPseudo
   clangPseudoGrammar
   )
+
+clang_target_link_libraries(clangPseudoCXX
+  PRIVATE
+  clangBasic
+  )
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c
index 8c4feb8f86169b..aef930f2c8fda7 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-2.c
@@ -34,24 +34,24 @@ int Test5() {
 
   int sum = 0;
   sum += sizeof(&S);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(__typeof(&S));
   sum += sizeof(&TS);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(__typeof(&TS));
   sum += sizeof(STRKWD MyStruct*);
   sum += sizeof(__typeof(STRKWD MyStruct*));
   sum += sizeof(TypedefStruct*);
   sum += sizeof(__typeof(TypedefStruct*));
   sum += sizeof(PTTS);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(PMyStruct);
   sum += sizeof(PS);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(PS2);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(&A10);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
 
 #ifdef __cplusplus
   MyStruct &rS = S;
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp
new file mode 100644
index 00000000000000..bfb2ec3a9eb02c
--- /dev/null
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression-any-pointer.cpp
@@ -0,0 +1,241 @@
+// RUN: %check_clang_tidy %s bugprone-sizeof-expression %t -- -config="{CheckOptions: {bugprone-sizeof-expression.WarnOnSizeOfIntegerExpression: true, bugprone-sizeof-expression.WarnOnSizeOfPointer: true}}" --
+
+class C {
+  int size() { return sizeof(this); }
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(this)'
+};
+
+#define LEN 8
+
+int X;
+extern int A[10];
+extern short B[10];
+
+#pragma pack(1)
+struct  S { char a, b, c; };
+
+enum E { E_VALUE = 0 };
+enum class EC { VALUE = 0 };
+
+bool AsBool() { return false; }
+int AsInt() { return 0; }
+E AsEnum() { return E_VALUE; }
+EC AsEnumClass() { return EC::VALUE; }
+S AsStruct() { return {}; }
+
+struct M {
+  int AsInt() { return 0; }
+  E AsEnum() { return E_VALUE; }
+  S AsStruct() { return {}; }
+};
+
+int Test1(const char* ptr) {
+  int sum = 0;
+  sum += sizeof(LEN);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(K)'
+  sum += sizeof(LEN + 1);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(K)'
+  sum += sizeof(sum, LEN);
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: suspicious usage of 'sizeof(..., ...)'
+  sum += sizeof(AsBool());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(AsInt());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(AsEnum());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(AsEnumClass());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(M{}.AsInt());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(M{}.AsEnum());
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in an integer
+  sum += sizeof(sizeof(X));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(LEN + sizeof(X));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(LEN + LEN + sizeof(X));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(LEN + (LEN + sizeof(X)));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(LEN + -sizeof(X));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(LEN + - + -sizeof(X));
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(sizeof(...))'
+  sum += sizeof(char) / sizeof(char);
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)'
+  sum += sizeof(A) / sizeof(S);
+  // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
+  sum += sizeof(char) / sizeof(int);
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
+  sum += sizeof(char) / sizeof(A);
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
+  sum += sizeof(B[0]) / sizeof(A);
+  // CHECK-MESSAGES: :[[@LINE-1]]:23: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
+  sum += sizeof(ptr) / sizeof(char);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(ptr) / sizeof(ptr[0]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(ptr) / sizeof(char*);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(ptr) / sizeof(void*);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(ptr) / sizeof(const void volatile*);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(ptr) / sizeof(char);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(int) * sizeof(char);
+  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication
+  sum += sizeof(ptr) * sizeof(ptr[0]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  // CHECK-MESSAGES: :[[@LINE-2]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication
+  sum += sizeof(int) * (2 * sizeof(char));
+  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication
+  sum += (2 * sizeof(char)) * sizeof(int);
+  // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious 'sizeof' by 'sizeof' multiplication
+  if (sizeof(A) < 0x100000) sum += 42;
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: suspicious comparison of 'sizeof(expr)' to a constant
+  if (sizeof(A) <= 0xFFFFFFFEU) sum += 42;
+  // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: suspicious comparison of 'sizeof(expr)' to a constant
+  return sum;
+}
+
+int Test5() {
+  typedef int Array10[10];
+  typedef C ArrayC[10];
+
+  struct MyStruct {
+    Array10 arr;
+    Array10* ptr;
+  };
+  typedef const MyStruct TMyStruct;
+  typedef const MyStruct *PMyStruct;
+  typedef TMyStruct *PMyStruct2;
+
+  static TMyStruct kGlocalMyStruct = {};
+  static TMyStruct volatile * kGlocalMyStructPtr = &kGlocalMyStruct;
+
+  MyStruct S;
+  PMyStruct PS;
+  PMyStruct2 PS2;
+  Array10 A10;
+  C *PtrArray[10];
+  C *PC;
+
+  char *PChar;
+  int *PInt, **PPInt;
+  MyStruct **PPMyStruct;
+
+  int sum = 0;
+  sum += sizeof(&S.arr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(&kGlocalMyStruct.arr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(&kGlocalMyStructPtr->arr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(S.arr + 0);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(+ S.arr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof((int*)S.arr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+
+  sum += sizeof(S.ptr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(kGlocalMyStruct.ptr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(kGlocalMyStructPtr->ptr);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+
+  sum += sizeof(&kGlocalMyStruct);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(&S);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(MyStruct*);
+  sum += sizeof(PMyStruct);
+  sum += sizeof(PS);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(PS2);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(&A10);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(PtrArray) / sizeof(PtrArray[1]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(A10) / sizeof(PtrArray[0]);
+  sum += sizeof(PC) / sizeof(PtrArray[0]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  // CHECK-MESSAGES: :[[@LINE-2]]:21: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)'
+  sum += sizeof(ArrayC) / sizeof(PtrArray[0]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
+
+  sum += sizeof(PChar);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(PInt);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(PPInt);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(PPMyStruct);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+
+  return sum;
+}
+
+void some_generic_function(const void *arg, int argsize);
+int *IntP, **IntPP;
+C *ClassP, **ClassPP;
+
+void GenericFunctionTest() {
+  // The `sizeof(pointer)` checks ignore situations where the pointer is
+  // produced by dereferencing a pointer-to-pointer, because this is unlikely
+  // to be an accident and can appear in legitimate code that tries to call
+  // a generic function which emulates dynamic typing within C.
+  some_generic_function(IntPP, sizeof(*IntPP));
+  some_generic_function(ClassPP, sizeof(*ClassPP));
+  // Using `...[0]` instead of the dereference operator is another common
+  // variant, which is also widespread in the idiomatic array-size calculation:
+  // `sizeof(array) / sizeof(array[0])`.
+  some_generic_function(IntPP, sizeof(IntPP[0]));
+  some_generic_function(ClassPP, sizeof(ClassPP[0]));
+  // FIXME: There is a third common pattern where the generic function is
+  // called with `&Variable` and `sizeof(Variable)`. Right now these are
+  // reported by the `sizeof(pointer)` checks, but this causes some false
+  // positives, so it would be good to create an exception for them.
+  some_generic_function(&IntPP, sizeof(IntP));
+  // CHECK-MESSAGES: :[[@LINE-1]]:33: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  some_generic_function(&ClassPP, sizeof(ClassP));
+  // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+}
+
+int ValidExpressions() {
+  int A[] = {1, 2, 3, 4};
+  static const char str[] = "hello";
+  static const char* ptr[] { "aaa", "bbb", "ccc" };
+  typedef C *CA10[10];
+  C *PtrArray[10];
+  CA10 PtrArray1;
+
+  int sum = 0;
+  if (sizeof(A) < 10)
+    sum += sizeof(A);
+  sum += sizeof(int);
+  sum += sizeof(AsStruct());
+  sum += sizeof(M{}.AsStruct());
+  sum += sizeof(A[sizeof(A) / sizeof(int)]);
+  // Here the outer sizeof is reported, but the inner ones are accepted:
+  sum += sizeof(&A[sizeof(A) / sizeof(int)]);
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+  sum += sizeof(sizeof(0));  // Special case: sizeof size_t.
+  sum += sizeof(void*);
+  sum += sizeof(void const *);
+  sum += sizeof(void const *) / 4;
+  sum += sizeof(str);
+  sum += sizeof(str) / sizeof(char);
+  sum += sizeof(str) / sizeof(str[0]);
+  sum += sizeof(ptr) / sizeof(ptr[0]);
+  sum += sizeof(ptr) / sizeof(*(ptr));
+  sum += sizeof(PtrArray) / sizeof(PtrArray[0]);
+  // Canonical type of PtrArray1 is same as PtrArray.
+  sum = sizeof(PtrArray) / sizeof(PtrArray1[0]);
+  // There is no warning for 'sizeof(T*)/sizeof(Q)' case.
+  sum += sizeof(PtrArray) / sizeof(A[0]);
+  return sum;
+}
diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp
index 003a02209c3d2d..064f31cb08c6b3 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/sizeof-expression.cpp
@@ -124,8 +124,6 @@ int Test1(const char* ptr) {
   // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(P*)/sizeof(Q*)'
   sum += sizeof(ptr) / sizeof(char);
   // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'
-  sum += sizeof(ptr) / sizeof(ptr[0]);
-  // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious usage of sizeof pointer 'sizeof(T*)/sizeof(T)'
   sum += sizeof(int) * sizeof(char);
   // CHECK-MESSAGES: :[[@LINE-1]]:22: warning: suspicious 'sizeof' by 'sizeof' multiplication
   sum += sizeof(ptr) * sizeof(ptr[0]);
@@ -207,50 +205,57 @@ int Test5() {
   C *PtrArray[10];
   C *PC;
 
+  char *PChar;
+  int *PInt, **PPInt;
+  MyStruct **PPMyStruct;
+
   int sum = 0;
   sum += sizeof(&S.arr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(&kGlocalMyStruct.arr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(&kGlocalMyStructPtr->arr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(S.arr + 0);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(+ S.arr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof((int*)S.arr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
 
   sum += sizeof(S.ptr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(kGlocalMyStruct.ptr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(kGlocalMyStructPtr->ptr);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
 
   sum += sizeof(&kGlocalMyStruct);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(&S);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(MyStruct*);
   sum += sizeof(PMyStruct);
   sum += sizeof(PS);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(PS2);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(&A10);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(PtrArray) / sizeof(PtrArray[1]);
-  // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:29: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   sum += sizeof(A10) / sizeof(PtrArray[0]);
-  // CHECK-MESSAGES: :[[@LINE-1]]:24: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
   sum += sizeof(PC) / sizeof(PtrArray[0]);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
   // CHECK-MESSAGES: :[[@LINE-2]]:21: warning: suspicious usage of sizeof pointer 'sizeof(T)/sizeof(T)'
-  // CHECK-MESSAGES: :[[@LINE-3]]:23: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
   sum += sizeof(ArrayC) / sizeof(PtrArray[0]);
   // CHECK-MESSAGES: :[[@LINE-1]]:25: warning: suspicious usage of 'sizeof(...)/sizeof(...)'; numerator is not a multiple of denominator
-  // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: suspicious usage of 'sizeof(A*)'; pointer to aggregate
+
+  // These pointers do not point to aggregate types, so they are not reported in this mode:
+  sum += sizeof(PChar);
+  sum += sizeof(PInt);
+  sum += sizeof(PPInt);
+  sum += sizeof(PPMyStruct);
 
   return sum;
 }
@@ -293,6 +298,32 @@ bool Baz() { return sizeof(A) < N; }
 // CHECK-MESSAGES: :[[@LINE-1]]:31: warning: suspicious comparison of 'sizeof(expr)' to a constant
 bool Test7() { return Baz<-1>(); }
 
+void some_generic_function(const void *arg, int argsize);
+int *IntP, **IntPP;
+C *ClassP, **ClassPP;
+
+void GenericFunctionTest() {
+  // The `sizeof(pointer)` checks ignore situations where the pointer is
+  // produced by dereferencing a pointer-to-pointer, because this is unlikely
+  // to be an accident and can appear in legitimate code that tries to call
+  // a generic function which emulates dynamic typing within C.
+  some_generic_function(IntPP, sizeof(*IntPP));
+  some_generic_function(ClassPP, sizeof(*ClassPP));
+  // Using `...[0]` instead of the dereference operator is another common
+  // variant, which is also widespread in the idiomatic array-size calculation:
+  // `sizeof(array) / sizeof(array[0])`.
+  some_generic_function(IntPP, sizeof(IntPP[0]));
+  some_generic_function(ClassPP, sizeof(ClassPP[0]));
+  // FIXME: There is a third common pattern where the generic function is
+  // called with `&Variable` and `sizeof(Variable)`. Right now these are
+  // reported by the `sizeof(pointer)` checks, but this causes some false
+  // positives, so it would be good to create an exception for them.
+  // NOTE: `sizeof(IntP)` is only reported with `WarnOnSizeOfPointer=true`.
+  some_generic_function(&IntPP, sizeof(IntP));
+  some_generic_function(&ClassPP, sizeof(ClassP));
+  // CHECK-MESSAGES: :[[@LINE-1]]:35: warning: suspicious usage of 'sizeof()' on an expression that results in a pointer
+}
+
 int ValidExpressions() {
   int A[] = {1, 2, 3, 4};
   static const char str[] = "hello";
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp
index e10ac3f46e2e9d..d5ea96b00254c2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/include-cleaner.cpp
@@ -15,3 +15,11 @@ std::string HelloString;
 // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: no header providing "std::string" is directly included [misc-include-cleaner]
 int FooBarResult = foobar();
 // CHECK-MESSAGES: :[[@LINE-1]]:20: warning: no header providing "foobar" is directly included [misc-include-cleaner]
+
+namespace valid {
+
+namespace gh93335 {
+void log2() {}
+} // namespace gh93335
+
+} // namespace valid
diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt
index 3717a172e8627f..ec370806683c9f 100644
--- a/clang/CMakeLists.txt
+++ b/clang/CMakeLists.txt
@@ -375,7 +375,9 @@ if (LLVM_COMPILER_IS_GCC_COMPATIBLE)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pedantic -Wno-long-long")
   endif ()
 
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nested-anon-types" )
+  endif ()
 endif ()
 
 # Determine HOST_LINK_VERSION on Darwin.
diff --git a/clang/docs/tools/clang-formatted-files.txt b/clang/docs/tools/clang-formatted-files.txt
index dee51e402b687f..4866bd4aee634f 100644
--- a/clang/docs/tools/clang-formatted-files.txt
+++ b/clang/docs/tools/clang-formatted-files.txt
@@ -622,6 +622,7 @@ clang/tools/libclang/CXCursor.h
 clang/tools/scan-build-py/tests/functional/src/include/clean-one.h
 clang/unittests/Analysis/CFGBuildResult.h
 clang/unittests/Analysis/MacroExpansionContextTest.cpp
+clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp
 clang/unittests/Analysis/FlowSensitive/CNFFormula.cpp
 clang/unittests/Analysis/FlowSensitive/DataflowAnalysisContextTest.cpp
 clang/unittests/Analysis/FlowSensitive/DataflowEnvironmentTest.cpp
diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 8bce4812f0d482..a1d1d1c51cd417 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -3203,6 +3203,9 @@ class ASTContext : public RefCountedBase<ASTContext> {
   /// valid feature names.
   ParsedTargetAttr filterFunctionTargetAttrs(const TargetAttr *TD) const;
 
+  std::vector<std::string>
+  filterFunctionTargetVersionAttrs(const TargetVersionAttr *TV) const;
+
   void getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
                              const FunctionDecl *) const;
   void getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
diff --git a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
index 7bdb9052e57e74..e99c5b2466334a 100644
--- a/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
+++ b/clang/include/clang/Analysis/Analyses/ThreadSafetyCommon.h
@@ -330,9 +330,9 @@ class CapabilityExpr {
 
   bool shouldIgnore() const { return sexpr() == nullptr; }
 
-  bool isInvalid() const { return sexpr() && isa<til::Undefined>(sexpr()); }
+  bool isInvalid() const { return isa_and_nonnull<til::Undefined>(sexpr()); }
 
-  bool isUniversal() const { return sexpr() && isa<til::Wildcard>(sexpr()); }
+  bool isUniversal() const { return isa_and_nonnull<til::Wildcard>(sexpr()); }
 };
 
 // Translate clang::Expr to til::SExpr.
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index 11982af3fa609b..7bef5fd7ad40f2 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -482,11 +482,11 @@ def SqrtF16F128 : Builtin, F16F128MathTemplate {
   let Prototype = "T(T)";
 }
 
-def TanF128 : Builtin {
-  let Spellings = ["__builtin_tanf128"];
+def TanF16F128 : Builtin, F16F128MathTemplate {
+  let Spellings = ["__builtin_tan"];
   let Attributes = [FunctionWithBuiltinPrefix, NoThrow,
                     ConstIgnoringErrnoAndExceptions];
-  let Prototype = "__float128(__float128)";
+  let Prototype = "T(T)";
 }
 
 def TanhF128 : Builtin {
diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def
index b762e44e755ec4..53f410d3cb4bde 100644
--- a/clang/include/clang/Basic/Features.def
+++ b/clang/include/clang/Basic/Features.def
@@ -96,6 +96,7 @@ FEATURE(nullability, true)
 FEATURE(nullability_on_arrays, true)
 FEATURE(nullability_on_classes, true)
 FEATURE(nullability_nullable_result, true)
+FEATURE(numerical_stability_sanitizer, LangOpts.Sanitize.has(SanitizerKind::NumericalStability))
 FEATURE(memory_sanitizer,
         LangOpts.Sanitize.hasOneOf(SanitizerKind::Memory |
                                    SanitizerKind::KernelMemory))
diff --git a/clang/include/clang/Basic/Sanitizers.def b/clang/include/clang/Basic/Sanitizers.def
index b228ffd07ee745..bee35e9dca7c39 100644
--- a/clang/include/clang/Basic/Sanitizers.def
+++ b/clang/include/clang/Basic/Sanitizers.def
@@ -76,6 +76,9 @@ SANITIZER("fuzzer-no-link", FuzzerNoLink)
 // ThreadSanitizer
 SANITIZER("thread", Thread)
 
+// Numerical stability sanitizer.
+SANITIZER("numerical", NumericalStability)
+
 // LeakSanitizer
 SANITIZER("leak", Leak)
 
diff --git a/clang/include/clang/Driver/SanitizerArgs.h b/clang/include/clang/Driver/SanitizerArgs.h
index 07070ec4fc0653..47ef175302679f 100644
--- a/clang/include/clang/Driver/SanitizerArgs.h
+++ b/clang/include/clang/Driver/SanitizerArgs.h
@@ -103,6 +103,9 @@ class SanitizerArgs {
   bool needsCfiDiagRt() const;
   bool needsStatsRt() const { return Stats; }
   bool needsScudoRt() const { return Sanitizers.has(SanitizerKind::Scudo); }
+  bool needsNsanRt() const {
+    return Sanitizers.has(SanitizerKind::NumericalStability);
+  }
 
   bool hasMemTag() const {
     return hasMemtagHeap() || hasMemtagStack() || hasMemtagGlobals();
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index c0850a8fa9f7f8..9b1628d2d86f9e 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -1360,7 +1360,7 @@ class Preprocessor {
 
     MacroState &S = CurSubmoduleState->Macros[II];
     auto *MD = S.getLatest();
-    while (MD && isa<VisibilityMacroDirective>(MD))
+    while (isa_and_nonnull<VisibilityMacroDirective>(MD))
       MD = MD->getPrevious();
     return MacroDefinition(dyn_cast_or_null<DefMacroDirective>(MD),
                            S.getActiveModuleMacros(*this, II),
diff --git a/clang/include/clang/Sema/SemaObjC.h b/clang/include/clang/Sema/SemaObjC.h
index 91430797e5ed82..bb8887691ce5d3 100644
--- a/clang/include/clang/Sema/SemaObjC.h
+++ b/clang/include/clang/Sema/SemaObjC.h
@@ -383,7 +383,7 @@ class SemaObjC : public SemaBase {
   void AddAnyMethodToGlobalPool(Decl *D);
 
   void ActOnStartOfObjCMethodDef(Scope *S, Decl *D);
-  bool isObjCMethodDecl(Decl *D) { return D && isa<ObjCMethodDecl>(D); }
+  bool isObjCMethodDecl(Decl *D) { return isa_and_nonnull<ObjCMethodDecl>(D); }
 
   /// CheckImplementationIvars - This routine checks if the instance variables
   /// listed in the implelementation match those listed in the interface.
diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
index 151d3e57c1cb81..59805d01be5db7 100644
--- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
+++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/MemRegion.h
@@ -781,7 +781,7 @@ class SymbolicRegion : public SubRegion {
       : SubRegion(sreg, SymbolicRegionKind), sym(s) {
     // Because pointer arithmetic is represented by ElementRegion layers,
     // the base symbol here should not contain any arithmetic.
-    assert(s && isa<SymbolData>(s));
+    assert(isa_and_nonnull<SymbolData>(s));
     assert(s->getType()->isAnyPointerType() ||
            s->getType()->isReferenceType() ||
            s->getType()->isBlockPointerType());
diff --git a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
index 1e6354f71e294a..7390ea17c8a4b6 100644
--- a/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
+++ b/clang/lib/ARCMigrate/TransUnbridgedCasts.cpp
@@ -371,7 +371,7 @@ class UnbridgedCastRewriter : public RecursiveASTVisitor<UnbridgedCastRewriter>{
       Stmt *parent = E;
       do {
         parent = StmtMap->getParentIgnoreParenImpCasts(parent);
-      } while (parent && isa<FullExpr>(parent));
+      } while (isa_and_nonnull<FullExpr>(parent));
 
       if (ReturnStmt *retS = dyn_cast_or_null<ReturnStmt>(parent)) {
         std::string note = "remove the cast and change return type of function "
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index c74c78399530a4..b40e80c935167f 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -87,7 +87,6 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/AArch64TargetParser.h"
 #include "llvm/TargetParser/Triple.h"
 #include <algorithm>
 #include <cassert>
@@ -13665,20 +13664,17 @@ QualType ASTContext::getCorrespondingSignedFixedPointType(QualType Ty) const {
   }
 }
 
-// Given a list of FMV features, return a concatenated list of the
-// corresponding backend features (which may contain duplicates).
-static std::vector<std::string> getFMVBackendFeaturesFor(
-    const llvm::SmallVectorImpl<StringRef> &FMVFeatStrings) {
-  std::vector<std::string> BackendFeats;
-  for (StringRef F : FMVFeatStrings) {
-    if (auto FMVExt = llvm::AArch64::parseArchExtension(F)) {
-      SmallVector<StringRef, 8> Feats;
-      FMVExt->DependentFeatures.split(Feats, ',', -1, false);
-      for (StringRef F : Feats)
-        BackendFeats.push_back(F.str());
-    }
-  }
-  return BackendFeats;
+std::vector<std::string> ASTContext::filterFunctionTargetVersionAttrs(
+    const TargetVersionAttr *TV) const {
+  assert(TV != nullptr);
+  llvm::SmallVector<StringRef, 8> Feats;
+  std::vector<std::string> ResFeats;
+  TV->getFeatures(Feats);
+  for (auto &Feature : Feats)
+    if (Target->validateCpuSupports(Feature.str()))
+      // Use '?' to mark features that came from TargetVersion.
+      ResFeats.push_back("?" + Feature.str());
+  return ResFeats;
 }
 
 ParsedTargetAttr
@@ -13713,12 +13709,10 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
 
     // Make a copy of the features as passed on the command line into the
     // beginning of the additional features from the function to override.
-    // AArch64 handles command line option features in parseTargetAttr().
-    if (!Target->getTriple().isAArch64())
-      ParsedAttr.Features.insert(
-          ParsedAttr.Features.begin(),
-          Target->getTargetOpts().FeaturesAsWritten.begin(),
-          Target->getTargetOpts().FeaturesAsWritten.end());
+    ParsedAttr.Features.insert(
+        ParsedAttr.Features.begin(),
+        Target->getTargetOpts().FeaturesAsWritten.begin(),
+        Target->getTargetOpts().FeaturesAsWritten.end());
 
     if (ParsedAttr.CPU != "" && Target->isValidCPUName(ParsedAttr.CPU))
       TargetCPU = ParsedAttr.CPU;
@@ -13739,31 +13733,32 @@ void ASTContext::getFunctionFeatureMap(llvm::StringMap<bool> &FeatureMap,
                     Target->getTargetOpts().FeaturesAsWritten.end());
     Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
   } else if (const auto *TC = FD->getAttr<TargetClonesAttr>()) {
+    std::vector<std::string> Features;
     if (Target->getTriple().isAArch64()) {
+      // TargetClones for AArch64
       llvm::SmallVector<StringRef, 8> Feats;
       TC->getFeatures(Feats, GD.getMultiVersionIndex());
-      std::vector<std::string> Features = getFMVBackendFeaturesFor(Feats);
+      for (StringRef Feat : Feats)
+        if (Target->validateCpuSupports(Feat.str()))
+          // Use '?' to mark features that came from AArch64 TargetClones.
+          Features.push_back("?" + Feat.str());
       Features.insert(Features.begin(),
                       Target->getTargetOpts().FeaturesAsWritten.begin(),
                       Target->getTargetOpts().FeaturesAsWritten.end());
-      Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
     } else {
-      std::vector<std::string> Features;
       StringRef VersionStr = TC->getFeatureStr(GD.getMultiVersionIndex());
       if (VersionStr.starts_with("arch="))
         TargetCPU = VersionStr.drop_front(sizeof("arch=") - 1);
       else if (VersionStr != "default")
         Features.push_back((StringRef{"+"} + VersionStr).str());
-      Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
     }
-  } else if (const auto *TV = FD->getAttr<TargetVersionAttr>()) {
-    llvm::SmallVector<StringRef, 8> Feats;
-    TV->getFeatures(Feats);
-    std::vector<std::string> Features = getFMVBackendFeaturesFor(Feats);
-    Features.insert(Features.begin(),
-                    Target->getTargetOpts().FeaturesAsWritten.begin(),
-                    Target->getTargetOpts().FeaturesAsWritten.end());
     Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Features);
+  } else if (const auto *TV = FD->getAttr<TargetVersionAttr>()) {
+    std::vector<std::string> Feats = filterFunctionTargetVersionAttrs(TV);
+    Feats.insert(Feats.begin(),
+                 Target->getTargetOpts().FeaturesAsWritten.begin(),
+                 Target->getTargetOpts().FeaturesAsWritten.end());
+    Target->initFeatureMap(FeatureMap, getDiagnostics(), TargetCPU, Feats);
   } else {
     FeatureMap = Target->getTargetOpts().FeatureMap;
   }
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 3b9080e09b3313..02cd4ed9a6cace 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1505,7 +1505,7 @@ ExpectedType ASTNodeImporter::VisitInjectedClassNameType(
   // The InjectedClassNameType is created in VisitRecordDecl when the
   // T->getDecl() is imported. Here we can return the existing type.
   const Type *Ty = (*ToDeclOrErr)->getTypeForDecl();
-  assert(Ty && isa<InjectedClassNameType>(Ty));
+  assert(isa_and_nonnull<InjectedClassNameType>(Ty));
   return QualType(Ty, 0);
 }
 
diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index 0328666d59b1fc..a5d3dacfc1a84e 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -139,6 +139,4 @@ add_clang_library(clangAST
   omp_gen
   ClangDriverOptions
   intrinsics_gen
-  # These generated headers are included transitively.
-  AArch64TargetParserTableGen
   )
diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp
index 5507481e6cefc8..d94533598438cc 100644
--- a/clang/lib/AST/DeclBase.cpp
+++ b/clang/lib/AST/DeclBase.cpp
@@ -1116,7 +1116,7 @@ bool Decl::isInExportDeclContext() const {
   while (DC && !isa<ExportDecl>(DC))
     DC = DC->getLexicalParent();
 
-  return DC && isa<ExportDecl>(DC);
+  return isa_and_nonnull<ExportDecl>(DC);
 }
 
 bool Decl::isInAnotherModuleUnit() const {
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
index c0ceee142a9cf0..82cccc4018fc91 100644
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -837,7 +837,7 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK,
     typedef SmallVector<const ClassTemplateSpecializationDecl *, 8> SpecsTy;
     SpecsTy Specs;
     const DeclContext *Ctx = FD->getDeclContext();
-    while (Ctx && isa<NamedDecl>(Ctx)) {
+    while (isa_and_nonnull<NamedDecl>(Ctx)) {
       const ClassTemplateSpecializationDecl *Spec
                                = dyn_cast<ClassTemplateSpecializationDecl>(Ctx);
       if (Spec && !Spec->isExplicitSpecialization())
@@ -3067,7 +3067,7 @@ Expr *Expr::IgnoreParenCasts() {
 
 Expr *Expr::IgnoreConversionOperatorSingleStep() {
   if (auto *MCE = dyn_cast<CXXMemberCallExpr>(this)) {
-    if (MCE->getMethodDecl() && isa<CXXConversionDecl>(MCE->getMethodDecl()))
+    if (isa_and_nonnull<CXXConversionDecl>(MCE->getMethodDecl()))
       return MCE->getImplicitObjectArgument();
   }
   return this;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 86fb396fabe2d9..d5057452cec9c5 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -2130,7 +2130,7 @@ static bool IsWeakLValue(const LValue &Value) {
 
 static bool isZeroSized(const LValue &Value) {
   const ValueDecl *Decl = GetLValueBaseDecl(Value);
-  if (Decl && isa<VarDecl>(Decl)) {
+  if (isa_and_nonnull<VarDecl>(Decl)) {
     QualType Ty = Decl->getType();
     if (Ty->isArrayType())
       return Ty->isIncompleteType() ||
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 6654a27c921689..0385ca4b3a0639 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -318,7 +318,8 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     if (DiscardResult)
       return this->discard(SubExpr);
 
-    std::optional<PrimType> FromT = classify(SubExpr->getType());
+    QualType SubExprTy = SubExpr->getType();
+    std::optional<PrimType> FromT = classify(SubExprTy);
     std::optional<PrimType> ToT = classify(CE->getType());
     if (!FromT || !ToT)
       return false;
@@ -326,9 +327,14 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     assert(isPtrType(*FromT));
     assert(isPtrType(*ToT));
     if (FromT == ToT) {
-      if (SubExpr->getType()->isVoidPointerType())
-        return this->visit(SubExpr) && this->emitVoidPtrCast(CE);
-      return this->delegate(SubExpr);
+      if (CE->getType()->isVoidPointerType())
+        return this->delegate(SubExpr);
+
+      if (!this->visit(SubExpr))
+        return false;
+      if (FromT == PT_Ptr)
+        return this->emitPtrPtrCast(SubExprTy->isVoidPointerType(), CE);
+      return true;
     }
 
     if (!this->visit(SubExpr))
@@ -1682,6 +1688,17 @@ bool ByteCodeExprGen<Emitter>::VisitObjCStringLiteral(
   return this->delegate(E->getString());
 }
 
+template <class Emitter>
+bool ByteCodeExprGen<Emitter>::VisitObjCEncodeExpr(const ObjCEncodeExpr *E) {
+  auto &A = Ctx.getASTContext();
+  std::string Str;
+  A.getObjCEncodingForType(E->getEncodedType(), Str);
+  StringLiteral *SL =
+      StringLiteral::Create(A, Str, StringLiteralKind::Ordinary,
+                            /*Pascal=*/false, E->getType(), E->getAtLoc());
+  return this->delegate(SL);
+}
+
 template <class Emitter>
 bool ByteCodeExprGen<Emitter>::VisitSYCLUniqueStableNameExpr(
     const SYCLUniqueStableNameExpr *E) {
diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h
index 7ab14b6ab383e8..295cfef0525cd8 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.h
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.h
@@ -91,6 +91,7 @@ class ByteCodeExprGen : public ConstStmtVisitor<ByteCodeExprGen<Emitter>, bool>,
   bool VisitAbstractConditionalOperator(const AbstractConditionalOperator *E);
   bool VisitStringLiteral(const StringLiteral *E);
   bool VisitObjCStringLiteral(const ObjCStringLiteral *E);
+  bool VisitObjCEncodeExpr(const ObjCEncodeExpr *E);
   bool VisitSYCLUniqueStableNameExpr(const SYCLUniqueStableNameExpr *E);
   bool VisitCharacterLiteral(const CharacterLiteral *E);
   bool VisitCompoundAssignOperator(const CompoundAssignOperator *E);
diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h
index 0ad710c5ec1afc..784e138e1467d4 100644
--- a/clang/lib/AST/Interp/Interp.h
+++ b/clang/lib/AST/Interp/Interp.h
@@ -1980,10 +1980,25 @@ static inline bool CastPointerIntegralAPS(InterpState &S, CodePtr OpPC,
   return true;
 }
 
-static inline bool VoidPtrCast(InterpState &S, CodePtr OpPC) {
-  const SourceInfo &E = S.Current->getSource(OpPC);
-  S.CCEDiag(E, diag::note_constexpr_invalid_cast)
-      << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
+static inline bool PtrPtrCast(InterpState &S, CodePtr OpPC, bool SrcIsVoidPtr) {
+  const auto &Ptr = S.Stk.peek<Pointer>();
+
+  if (SrcIsVoidPtr && S.getLangOpts().CPlusPlus) {
+    bool HasValidResult = !Ptr.isZero();
+
+    if (HasValidResult) {
+      // FIXME: note_constexpr_invalid_void_star_cast
+    } else if (!S.getLangOpts().CPlusPlus26) {
+      const SourceInfo &E = S.Current->getSource(OpPC);
+      S.CCEDiag(E, diag::note_constexpr_invalid_cast)
+          << 3 << "'void *'" << S.Current->getRange(OpPC);
+    }
+  } else {
+    const SourceInfo &E = S.Current->getSource(OpPC);
+    S.CCEDiag(E, diag::note_constexpr_invalid_cast)
+        << 2 << S.getLangOpts().CPlusPlus << S.Current->getRange(OpPC);
+  }
+
   return true;
 }
 
diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td
index ac5426c87c2123..df362efd8b58b2 100644
--- a/clang/lib/AST/Interp/Opcodes.td
+++ b/clang/lib/AST/Interp/Opcodes.td
@@ -139,7 +139,6 @@ class AluOpcode : Opcode {
 }
 
 class FloatOpcode : Opcode {
-  let Types = [];
   let Args = [ArgRoundingMode];
 }
 
@@ -195,17 +194,14 @@ def NoRet : Opcode {}
 
 def Call : Opcode {
   let Args = [ArgFunction, ArgUint32];
-  let Types = [];
 }
 
 def CallVirt : Opcode {
   let Args = [ArgFunction, ArgUint32];
-  let Types = [];
 }
 
 def CallBI : Opcode {
   let Args = [ArgFunction, ArgCallExpr];
-  let Types = [];
 }
 
 def CallPtr : Opcode {
@@ -214,7 +210,6 @@ def CallPtr : Opcode {
 
 def CallVar : Opcode {
   let Args = [ArgFunction, ArgUint32];
-  let Types = [];
 }
 
 def OffsetOf : Opcode {
@@ -399,8 +394,6 @@ def InitGlobalTemp : AccessOpcode {
 // [Pointer] -> [Pointer]
 def InitGlobalTempComp : Opcode {
   let Args = [ArgLETD];
-  let Types = [];
-  let HasGroup = 0;
 }
 // [Value] -> []
 def SetGlobal : AccessOpcode;
@@ -505,13 +498,9 @@ def SubPtr : Opcode {
 }
 
 // [Pointer] -> [Pointer]
-def IncPtr : Opcode {
-  let HasGroup = 0;
-}
+def IncPtr : Opcode;
 // [Pointer] -> [Pointer]
-def DecPtr : Opcode {
-  let HasGroup = 0;
-}
+def DecPtr : Opcode;
 
 //===----------------------------------------------------------------------===//
 // Function pointers.
@@ -607,7 +596,6 @@ def Cast: Opcode {
 }
 
 def CastFP : Opcode {
-  let Types = [];
   let Args = [ArgFltSemantics, ArgRoundingMode];
 }
 
@@ -642,12 +630,10 @@ def CastFloatingIntegral : Opcode {
 }
 
 def CastFloatingIntegralAP : Opcode {
-  let Types = [];
   let Args = [ArgUint32];
 }
 
 def CastFloatingIntegralAPS : Opcode {
-  let Types = [];
   let Args = [ArgUint32];
 }
 
@@ -656,16 +642,15 @@ def CastPointerIntegral : Opcode {
   let HasGroup = 1;
 }
 def CastPointerIntegralAP : Opcode {
-  let Types = [];
-  let HasGroup = 0;
   let Args = [ArgUint32];
 }
 def CastPointerIntegralAPS : Opcode {
-  let Types = [];
-  let HasGroup = 0;
   let Args = [ArgUint32];
 }
-def VoidPtrCast : Opcode;
+def PtrPtrCast : Opcode {
+  let Args = [ArgBool];
+
+}
 
 def DecayPtr : Opcode {
   let Types = [PtrTypeClass, PtrTypeClass];
diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp
index 30cff1ba2e6f37..4fbf0e3b42dbc8 100644
--- a/clang/lib/AST/Mangle.cpp
+++ b/clang/lib/AST/Mangle.cpp
@@ -301,9 +301,8 @@ void MangleContext::mangleBlock(const DeclContext *DC, const BlockDecl *BD,
   } else {
     assert((isa<NamedDecl>(DC) || isa<BlockDecl>(DC)) &&
            "expected a NamedDecl or BlockDecl");
-    if (isa<BlockDecl>(DC))
-      for (; DC && isa<BlockDecl>(DC); DC = DC->getParent())
-        (void) getBlockId(cast<BlockDecl>(DC), true);
+    for (; isa_and_nonnull<BlockDecl>(DC); DC = DC->getParent())
+      (void)getBlockId(cast<BlockDecl>(DC), true);
     assert((isa<TranslationUnitDecl>(DC) || isa<NamedDecl>(DC)) &&
            "expected a TranslationUnitDecl or a NamedDecl");
     if (const auto *CD = dyn_cast<CXXConstructorDecl>(DC))
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 36d611750ca48c..ffc5d2d4cd8fc3 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -899,6 +899,8 @@ void MicrosoftCXXNameMangler::mangleFloat(llvm::APFloat Number) {
   case APFloat::S_Float8E4M3FNUZ:
   case APFloat::S_Float8E4M3B11FNUZ:
   case APFloat::S_FloatTF32:
+  case APFloat::S_Float6E3M2FN:
+  case APFloat::S_Float6E2M3FN:
     llvm_unreachable("Tried to mangle unexpected APFloat semantics");
   }
 
@@ -2748,7 +2750,7 @@ void MicrosoftCXXNameMangler::mangleFunctionType(const FunctionType *T,
       return;
     }
     Out << '@';
-  } else if (IsInLambda && D && isa<CXXConversionDecl>(D)) {
+  } else if (IsInLambda && isa_and_nonnull<CXXConversionDecl>(D)) {
     // The only lambda conversion operators are to function pointers, which
     // can differ by their calling convention and are typically deduced.  So
     // we make sure that this type gets mangled properly.
diff --git a/clang/lib/AST/ParentMap.cpp b/clang/lib/AST/ParentMap.cpp
index 3d6a1cc84c7b10..e97cb5e226f5c2 100644
--- a/clang/lib/AST/ParentMap.cpp
+++ b/clang/lib/AST/ParentMap.cpp
@@ -139,7 +139,9 @@ Stmt* ParentMap::getParent(Stmt* S) const {
 }
 
 Stmt *ParentMap::getParentIgnoreParens(Stmt *S) const {
-  do { S = getParent(S); } while (S && isa<ParenExpr>(S));
+  do {
+    S = getParent(S);
+  } while (isa_and_nonnull<ParenExpr>(S));
   return S;
 }
 
@@ -155,7 +157,8 @@ Stmt *ParentMap::getParentIgnoreParenCasts(Stmt *S) const {
 Stmt *ParentMap::getParentIgnoreParenImpCasts(Stmt *S) const {
   do {
     S = getParent(S);
-  } while (S && isa<Expr>(S) && cast<Expr>(S)->IgnoreParenImpCasts() != S);
+  } while (isa_and_nonnull<Expr>(S) &&
+           cast<Expr>(S)->IgnoreParenImpCasts() != S);
 
   return S;
 }
diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp
index 7e030e05512690..8f51d16b5db037 100644
--- a/clang/lib/AST/StmtPrinter.cpp
+++ b/clang/lib/AST/StmtPrinter.cpp
@@ -84,7 +84,7 @@ namespace {
 
     void PrintStmt(Stmt *S, int SubIndent) {
       IndentLevel += SubIndent;
-      if (S && isa<Expr>(S)) {
+      if (isa_and_nonnull<Expr>(S)) {
         // If this is an expr used in a stmt context, indent and newline it.
         Indent();
         Visit(S);
@@ -1939,7 +1939,7 @@ void StmtPrinter::VisitCXXOperatorCallExpr(CXXOperatorCallExpr *Node) {
 void StmtPrinter::VisitCXXMemberCallExpr(CXXMemberCallExpr *Node) {
   // If we have a conversion operator call only print the argument.
   CXXMethodDecl *MD = Node->getMethodDecl();
-  if (MD && isa<CXXConversionDecl>(MD)) {
+  if (isa_and_nonnull<CXXConversionDecl>(MD)) {
     PrintExpr(Node->getImplicitObjectArgument());
     return;
   }
diff --git a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
index 38b5f51b7b2f02..27d42a7b508562 100644
--- a/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
+++ b/clang/lib/Analysis/FlowSensitive/ASTOps.cpp
@@ -100,7 +100,8 @@ getFieldsForInitListExpr(const InitListT *InitList) {
   std::vector<const FieldDecl *> Fields;
 
   if (InitList->getType()->isUnionType()) {
-    Fields.push_back(InitList->getInitializedFieldInUnion());
+    if (const FieldDecl *Field = InitList->getInitializedFieldInUnion())
+      Fields.push_back(Field);
     return Fields;
   }
 
@@ -137,9 +138,11 @@ RecordInitListHelper::RecordInitListHelper(
   // it doesn't do this -- so we create an `ImplicitValueInitExpr` ourselves.
   SmallVector<Expr *> InitsForUnion;
   if (Ty->isUnionType() && Inits.empty()) {
-    assert(Fields.size() == 1);
-    ImplicitValueInitForUnion.emplace(Fields.front()->getType());
-    InitsForUnion.push_back(&*ImplicitValueInitForUnion);
+    assert(Fields.size() <= 1);
+    if (!Fields.empty()) {
+      ImplicitValueInitForUnion.emplace(Fields.front()->getType());
+      InitsForUnion.push_back(&*ImplicitValueInitForUnion);
+    }
     Inits = InitsForUnion;
   }
 
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 0d7967c8b93449..7c88917faf9c65 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -415,7 +415,7 @@ class ResultObjectVisitor : public AnalysisASTVisitor<ResultObjectVisitor> {
     // below them can initialize the same object (or part of it).
     if (isa<CXXConstructExpr>(E) || isa<CallExpr>(E) || isa<LambdaExpr>(E) ||
         isa<CXXDefaultArgExpr>(E) || isa<CXXDefaultInitExpr>(E) ||
-        isa<CXXStdInitializerListExpr>(E) ||
+        isa<CXXStdInitializerListExpr>(E) || isa<AtomicExpr>(E) ||
         // We treat `BuiltinBitCastExpr` as an "original initializer" too as
         // it may not even be casting from a record type -- and even if it is,
         // the two objects are in general of unrelated type.
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 7f256b0933cf25..2884baecb78985 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -1051,18 +1051,57 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector<std::string> &Features,
   return true;
 }
 
+bool AArch64TargetInfo::initFeatureMap(
+    llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags, StringRef CPU,
+    const std::vector<std::string> &FeaturesVec) const {
+  std::vector<std::string> UpdatedFeaturesVec;
+  // Parse the CPU and add any implied features.
+  std::optional<llvm::AArch64::CpuInfo> CpuInfo = llvm::AArch64::parseCpu(CPU);
+  if (CpuInfo) {
+    auto Exts = CpuInfo->getImpliedExtensions();
+    std::vector<StringRef> CPUFeats;
+    llvm::AArch64::getExtensionFeatures(Exts, CPUFeats);
+    for (auto F : CPUFeats) {
+      assert((F[0] == '+' || F[0] == '-') && "Expected +/- in target feature!");
+      UpdatedFeaturesVec.push_back(F.str());
+    }
+  }
+
+  // Process target and dependent features. This is done in two loops collecting
+  // them into UpdatedFeaturesVec: first to add dependent '+'features, second to
+  // add target '+/-'features that can later disable some of features added on
+  // the first loop. Function Multi Versioning features begin with '?'.
+  for (const auto &Feature : FeaturesVec)
+    if (((Feature[0] == '?' || Feature[0] == '+')) &&
+        AArch64TargetInfo::doesFeatureAffectCodeGen(Feature.substr(1))) {
+      StringRef DepFeatures =
+          AArch64TargetInfo::getFeatureDependencies(Feature.substr(1));
+      SmallVector<StringRef, 1> AttrFeatures;
+      DepFeatures.split(AttrFeatures, ",");
+      for (auto F : AttrFeatures)
+        UpdatedFeaturesVec.push_back(F.str());
+    }
+  for (const auto &Feature : FeaturesVec)
+    if (Feature[0] != '?') {
+      std::string UpdatedFeature = Feature;
+      if (Feature[0] == '+') {
+        std::optional<llvm::AArch64::ExtensionInfo> Extension =
+            llvm::AArch64::parseArchExtension(Feature.substr(1));
+        if (Extension)
+          UpdatedFeature = Extension->Feature.str();
+      }
+      UpdatedFeaturesVec.push_back(UpdatedFeature);
+    }
+
+  return TargetInfo::initFeatureMap(Features, Diags, CPU, UpdatedFeaturesVec);
+}
+
 // Parse AArch64 Target attributes, which are a comma separated list of:
 //  "arch=<arch>" - parsed to features as per -march=..
 //  "cpu=<cpu>" - parsed to features as per -mcpu=.., with CPU set to <cpu>
 //  "tune=<cpu>" - TuneCPU set to <cpu>
 //  "feature", "no-feature" - Add (or remove) feature.
 //  "+feature", "+nofeature" - Add (or remove) feature.
-//
-// A feature may correspond to an Extension (anything with a corresponding
-// AEK_), in which case an ExtensionSet is used to parse it and expand its
-// dependencies. Otherwise the feature is passed through (e.g. +v8.1a,
-// +outline-atomics, -fmv, etc). Features coming from the command line are
-// already parsed, therefore their dependencies do not need expansion.
 ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
   ParsedTargetAttr Ret;
   if (Features == "default")
@@ -1072,26 +1111,23 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
   bool FoundArch = false;
 
   auto SplitAndAddFeatures = [](StringRef FeatString,
-                                std::vector<std::string> &Features,
-                                llvm::AArch64::ExtensionSet &FeatureBits) {
+                                std::vector<std::string> &Features) {
     SmallVector<StringRef, 8> SplitFeatures;
     FeatString.split(SplitFeatures, StringRef("+"), -1, false);
     for (StringRef Feature : SplitFeatures) {
-      if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true))
-        continue;
-      // Pass through features that are not extensions, e.g. +v8.1a,
-      // +outline-atomics, -fmv, etc.
-      if (Feature.starts_with("no"))
-        Features.push_back("-" + Feature.drop_front(2).str());
+      StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature);
+      if (!FeatureName.empty())
+        Features.push_back(FeatureName.str());
       else
-        Features.push_back("+" + Feature.str());
+        // Pushing the original feature string to give a sema error later on
+        // when they get checked.
+        if (Feature.starts_with("no"))
+          Features.push_back("-" + Feature.drop_front(2).str());
+        else
+          Features.push_back("+" + Feature.str());
     }
   };
 
-  llvm::AArch64::ExtensionSet FeatureBits;
-  // Reconstruct the bitset from the command line option features.
-  FeatureBits.reconstructFromParsedFeatures(getTargetOpts().FeaturesAsWritten);
-
   for (auto &Feature : AttrFeatures) {
     Feature = Feature.trim();
     if (Feature.starts_with("fpmath="))
@@ -1114,9 +1150,9 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
       // Ret.Features.
       if (!AI)
         continue;
-      FeatureBits.addArchDefaults(*AI);
+      Ret.Features.push_back(AI->ArchFeature.str());
       // Add any extra features, after the +
-      SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits);
+      SplitAndAddFeatures(Split.second, Ret.Features);
     } else if (Feature.starts_with("cpu=")) {
       if (!Ret.CPU.empty())
         Ret.Duplicate = "cpu=";
@@ -1126,10 +1162,7 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
         std::pair<StringRef, StringRef> Split =
             Feature.split("=").second.trim().split("+");
         Ret.CPU = Split.first;
-        if (auto CpuInfo = llvm::AArch64::parseCpu(Ret.CPU)) {
-          FeatureBits.addCPUDefaults(*CpuInfo);
-          SplitAndAddFeatures(Split.second, Ret.Features, FeatureBits);
-        }
+        SplitAndAddFeatures(Split.second, Ret.Features);
       }
     } else if (Feature.starts_with("tune=")) {
       if (!Ret.Tune.empty())
@@ -1137,19 +1170,25 @@ ParsedTargetAttr AArch64TargetInfo::parseTargetAttr(StringRef Features) const {
       else
         Ret.Tune = Feature.split("=").second.trim();
     } else if (Feature.starts_with("+")) {
-      SplitAndAddFeatures(Feature, Ret.Features, FeatureBits);
+      SplitAndAddFeatures(Feature, Ret.Features);
+    } else if (Feature.starts_with("no-")) {
+      StringRef FeatureName =
+          llvm::AArch64::getArchExtFeature(Feature.split("-").second);
+      if (!FeatureName.empty())
+        Ret.Features.push_back("-" + FeatureName.drop_front(1).str());
+      else
+        Ret.Features.push_back("-" + Feature.split("-").second.str());
     } else {
-      if (FeatureBits.parseModifier(Feature, /* AllowNoDashForm = */ true))
-        continue;
-      // Pass through features that are not extensions, e.g. +v8.1a,
-      // +outline-atomics, -fmv, etc.
-      if (Feature.starts_with("no-"))
-        Ret.Features.push_back("-" + Feature.drop_front(3).str());
+      // Try parsing the string to the internal target feature name. If it is
+      // invalid, add the original string (which could already be an internal
+      // name). These should be checked later by isValidFeatureName.
+      StringRef FeatureName = llvm::AArch64::getArchExtFeature(Feature);
+      if (!FeatureName.empty())
+        Ret.Features.push_back(FeatureName.str());
       else
         Ret.Features.push_back("+" + Feature.str());
     }
   }
-  FeatureBits.toLLVMFeatureList(Ret.Features);
   return Ret;
 }
 
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 696553ef8038a8..12fb50286f7511 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -107,6 +107,10 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   unsigned multiVersionSortPriority(StringRef Name) const override;
   unsigned multiVersionFeatureCost() const override;
 
+  bool
+  initFeatureMap(llvm::StringMap<bool> &Features, DiagnosticsEngine &Diags,
+                 StringRef CPU,
+                 const std::vector<std::string> &FeaturesVec) const override;
   bool useFP16ConversionIntrinsics() const override {
     return false;
   }
diff --git a/clang/lib/CodeGen/CGBlocks.cpp b/clang/lib/CodeGen/CGBlocks.cpp
index bf50f2025de573..5dac1cd425bf61 100644
--- a/clang/lib/CodeGen/CGBlocks.cpp
+++ b/clang/lib/CodeGen/CGBlocks.cpp
@@ -577,7 +577,7 @@ static void computeBlockInfo(CodeGenModule &CGM, CodeGenFunction *CGF,
 
   // First, 'this'.
   if (block->capturesCXXThis()) {
-    assert(CGF && CGF->CurFuncDecl && isa<CXXMethodDecl>(CGF->CurFuncDecl) &&
+    assert(CGF && isa_and_nonnull<CXXMethodDecl>(CGF->CurFuncDecl) &&
            "Can't capture 'this' outside a method");
     QualType thisType = cast<CXXMethodDecl>(CGF->CurFuncDecl)->getThisType();
 
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 62e6abe576c31d..8fbbc6632a4323 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2923,6 +2923,18 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
       SetSqrtFPAccuracy(Call);
       return RValue::get(Call);
     }
+
+    case Builtin::BItan:
+    case Builtin::BItanf:
+    case Builtin::BItanl:
+    case Builtin::BI__builtin_tan:
+    case Builtin::BI__builtin_tanf:
+    case Builtin::BI__builtin_tanf16:
+    case Builtin::BI__builtin_tanl:
+    case Builtin::BI__builtin_tanf128:
+      return RValue::get(emitUnaryMaybeConstrainedFPBuiltin(
+          *this, E, Intrinsic::tan, Intrinsic::experimental_constrained_tan));
+
     case Builtin::BItrunc:
     case Builtin::BItruncf:
     case Builtin::BItruncl:
diff --git a/clang/lib/CodeGen/CGClass.cpp b/clang/lib/CodeGen/CGClass.cpp
index 4c57dfb3290026..ebf02033a5d356 100644
--- a/clang/lib/CodeGen/CGClass.cpp
+++ b/clang/lib/CodeGen/CGClass.cpp
@@ -859,7 +859,7 @@ void CodeGenFunction::EmitConstructorBody(FunctionArgList &Args) {
 
   // Enter the function-try-block before the constructor prologue if
   // applicable.
-  bool IsTryBody = (Body && isa<CXXTryStmt>(Body));
+  bool IsTryBody = isa_and_nonnull<CXXTryStmt>(Body);
   if (IsTryBody)
     EnterCXXTryStmt(*cast<CXXTryStmt>(Body), true);
 
@@ -1475,7 +1475,7 @@ void CodeGenFunction::EmitDestructorBody(FunctionArgList &Args) {
 
   // If the body is a function-try-block, enter the try before
   // anything else.
-  bool isTryBody = (Body && isa<CXXTryStmt>(Body));
+  bool isTryBody = isa_and_nonnull<CXXTryStmt>(Body);
   if (isTryBody)
     EnterCXXTryStmt(*cast<CXXTryStmt>(Body), true);
   EmitAsanPrologueOrEpilogue(false);
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index b047279912f6b7..a88bb2af59fee0 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -476,6 +476,10 @@ llvm::Function *CodeGenModule::CreateGlobalInitOrCleanUpFunction(
       !isInNoSanitizeList(SanitizerKind::Thread, Fn, Loc))
     Fn->addFnAttr(llvm::Attribute::SanitizeThread);
 
+  if (getLangOpts().Sanitize.has(SanitizerKind::NumericalStability) &&
+      !isInNoSanitizeList(SanitizerKind::NumericalStability, Fn, Loc))
+    Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability);
+
   if (getLangOpts().Sanitize.has(SanitizerKind::Memory) &&
       !isInNoSanitizeList(SanitizerKind::Memory, Fn, Loc))
     Fn->addFnAttr(llvm::Attribute::SanitizeMemory);
diff --git a/clang/lib/CodeGen/CGExprConstant.cpp b/clang/lib/CodeGen/CGExprConstant.cpp
index 4eb65b34a89f56..0712f40fd8215a 100644
--- a/clang/lib/CodeGen/CGExprConstant.cpp
+++ b/clang/lib/CodeGen/CGExprConstant.cpp
@@ -715,7 +715,7 @@ bool ConstStructBuilder::Build(const InitListExpr *ILE, bool AllowOverwrite) {
     const Expr *Init = nullptr;
     if (ElementNo < ILE->getNumInits())
       Init = ILE->getInit(ElementNo++);
-    if (Init && isa<NoInitExpr>(Init))
+    if (isa_and_nonnull<NoInitExpr>(Init))
       continue;
 
     // Zero-sized fields are not emitted, but their initializers may still
diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp
index c7be7c0398411d..f3a398bde1af39 100644
--- a/clang/lib/CodeGen/CGStmtOpenMP.cpp
+++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp
@@ -72,7 +72,7 @@ class OMPLexicalScope : public CodeGenFunction::LexicalScope {
   static bool isCapturedVar(CodeGenFunction &CGF, const VarDecl *VD) {
     return CGF.LambdaCaptureFields.lookup(VD) ||
            (CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD)) ||
-           (CGF.CurCodeDecl && isa<BlockDecl>(CGF.CurCodeDecl) &&
+           (isa_and_nonnull<BlockDecl>(CGF.CurCodeDecl) &&
             cast<BlockDecl>(CGF.CurCodeDecl)->capturesVariable(VD));
   }
 
@@ -227,7 +227,7 @@ class OMPSimdLexicalScope : public CodeGenFunction::LexicalScope {
   static bool isCapturedVar(CodeGenFunction &CGF, const VarDecl *VD) {
     return CGF.LambdaCaptureFields.lookup(VD) ||
            (CGF.CapturedStmtInfo && CGF.CapturedStmtInfo->lookup(VD)) ||
-           (CGF.CurCodeDecl && isa<BlockDecl>(CGF.CurCodeDecl) &&
+           (isa_and_nonnull<BlockDecl>(CGF.CurCodeDecl) &&
             cast<BlockDecl>(CGF.CurCodeDecl)->capturesVariable(VD));
   }
 
@@ -315,7 +315,7 @@ LValue CodeGenFunction::EmitOMPSharedLValue(const Expr *E) {
       bool IsCaptured =
           LambdaCaptureFields.lookup(OrigVD) ||
           (CapturedStmtInfo && CapturedStmtInfo->lookup(OrigVD)) ||
-          (CurCodeDecl && isa<BlockDecl>(CurCodeDecl));
+          (isa_and_nonnull<BlockDecl>(CurCodeDecl));
       DeclRefExpr DRE(getContext(), const_cast<VarDecl *>(OrigVD), IsCaptured,
                       OrigDRE->getType(), VK_LValue, OrigDRE->getExprLoc());
       return EmitLValue(&DRE);
diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp
index f0345f3b191b88..cea0d84c64bc47 100644
--- a/clang/lib/CodeGen/CodeGenFunction.cpp
+++ b/clang/lib/CodeGen/CodeGenFunction.cpp
@@ -818,6 +818,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
       Fn->addFnAttr(llvm::Attribute::SanitizeMemTag);
     if (SanOpts.has(SanitizerKind::Thread))
       Fn->addFnAttr(llvm::Attribute::SanitizeThread);
+    if (SanOpts.has(SanitizerKind::NumericalStability))
+      Fn->addFnAttr(llvm::Attribute::SanitizeNumericalStability);
     if (SanOpts.hasOneOf(SanitizerKind::Memory | SanitizerKind::KernelMemory))
       Fn->addFnAttr(llvm::Attribute::SanitizeMemory);
   }
@@ -2951,7 +2953,7 @@ void CodeGenFunction::emitAlignmentAssumptionCheck(
     SourceLocation SecondaryLoc, llvm::Value *Alignment,
     llvm::Value *OffsetValue, llvm::Value *TheCheck,
     llvm::Instruction *Assumption) {
-  assert(Assumption && isa<llvm::CallInst>(Assumption) &&
+  assert(isa_and_nonnull<llvm::CallInst>(Assumption) &&
          cast<llvm::CallInst>(Assumption)->getCalledOperand() ==
              llvm::Intrinsic::getDeclaration(
                  Builder.GetInsertBlock()->getParent()->getParent(),
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 273f215ca94a88..86825a6ccf7a1d 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -41,7 +41,8 @@ static const SanitizerMask NotAllowedWithExecuteOnly =
     SanitizerKind::Function | SanitizerKind::KCFI;
 static const SanitizerMask NeedsUnwindTables =
     SanitizerKind::Address | SanitizerKind::HWAddress | SanitizerKind::Thread |
-    SanitizerKind::Memory | SanitizerKind::DataFlow;
+    SanitizerKind::Memory | SanitizerKind::DataFlow |
+    SanitizerKind::NumericalStability;
 static const SanitizerMask SupportsCoverage =
     SanitizerKind::Address | SanitizerKind::HWAddress |
     SanitizerKind::KernelAddress | SanitizerKind::KernelHWAddress |
@@ -53,7 +54,8 @@ static const SanitizerMask SupportsCoverage =
     SanitizerKind::DataFlow | SanitizerKind::Fuzzer |
     SanitizerKind::FuzzerNoLink | SanitizerKind::FloatDivideByZero |
     SanitizerKind::SafeStack | SanitizerKind::ShadowCallStack |
-    SanitizerKind::Thread | SanitizerKind::ObjCCast | SanitizerKind::KCFI;
+    SanitizerKind::Thread | SanitizerKind::ObjCCast | SanitizerKind::KCFI |
+    SanitizerKind::NumericalStability;
 static const SanitizerMask RecoverableByDefault =
     SanitizerKind::Undefined | SanitizerKind::Integer |
     SanitizerKind::ImplicitConversion | SanitizerKind::Nullability |
@@ -175,6 +177,7 @@ static void addDefaultIgnorelists(const Driver &D, SanitizerMask Kinds,
                      {"hwasan_ignorelist.txt", SanitizerKind::HWAddress},
                      {"memtag_ignorelist.txt", SanitizerKind::MemTag},
                      {"msan_ignorelist.txt", SanitizerKind::Memory},
+                     {"nsan_ignorelist.txt", SanitizerKind::NumericalStability},
                      {"tsan_ignorelist.txt", SanitizerKind::Thread},
                      {"dfsan_abilist.txt", SanitizerKind::DataFlow},
                      {"cfi_ignorelist.txt", SanitizerKind::CFI},
diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp
index 593b403a1e3f05..ed5737915aa96b 100644
--- a/clang/lib/Driver/ToolChains/Darwin.cpp
+++ b/clang/lib/Driver/ToolChains/Darwin.cpp
@@ -3448,6 +3448,7 @@ SanitizerMask Darwin::getSupportedSanitizers() const {
   Res |= SanitizerKind::PointerCompare;
   Res |= SanitizerKind::PointerSubtract;
   Res |= SanitizerKind::Leak;
+  Res |= SanitizerKind::NumericalStability;
   Res |= SanitizerKind::Fuzzer;
   Res |= SanitizerKind::FuzzerNoLink;
   Res |= SanitizerKind::ObjCCast;
diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp
index 70bb5720e1a7a1..594e3185ab8c63 100644
--- a/clang/lib/Driver/ToolChains/Linux.cpp
+++ b/clang/lib/Driver/ToolChains/Linux.cpp
@@ -1060,6 +1060,9 @@ SanitizerMask Linux::getSupportedSanitizers() const {
   if (IsX86_64 || IsAArch64) {
     Res |= SanitizerKind::KernelHWAddress;
   }
+  if (IsX86_64 || IsAArch64)
+    Res |= SanitizerKind::NumericalStability;
+
   // Work around "Cannot represent a difference across sections".
   if (getTriple().getArch() == llvm::Triple::ppc64)
     Res &= ~SanitizerKind::Function;
diff --git a/clang/lib/Index/IndexBody.cpp b/clang/lib/Index/IndexBody.cpp
index 08136baa5d408e..c18daf7faa7497 100644
--- a/clang/lib/Index/IndexBody.cpp
+++ b/clang/lib/Index/IndexBody.cpp
@@ -268,7 +268,7 @@ class BodyIndexer : public RecursiveASTVisitor<BodyIndexer> {
         }
         return true;
       };
-      bool IsPropCall = Containing && isa<PseudoObjectExpr>(Containing);
+      bool IsPropCall = isa_and_nonnull<PseudoObjectExpr>(Containing);
       // Implicit property message sends are not 'implicit'.
       if ((E->isImplicit() || IsPropCall) &&
           !(IsPropCall &&
diff --git a/clang/lib/Lex/PPMacroExpansion.cpp b/clang/lib/Lex/PPMacroExpansion.cpp
index 8af4a97d00cb82..f085b943716442 100644
--- a/clang/lib/Lex/PPMacroExpansion.cpp
+++ b/clang/lib/Lex/PPMacroExpansion.cpp
@@ -226,7 +226,7 @@ void Preprocessor::updateModuleMacroInfo(const IdentifierInfo *II,
   bool IsSystemMacro = true;
   bool IsAmbiguous = false;
   if (auto *MD = Info.MD) {
-    while (MD && isa<VisibilityMacroDirective>(MD))
+    while (isa_and_nonnull<VisibilityMacroDirective>(MD))
       MD = MD->getPrevious();
     if (auto *DMD = dyn_cast_or_null<DefMacroDirective>(MD)) {
       MI = DMD->getInfo();
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index b9d0b59ef1db73..0f604c61fa3af9 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -442,7 +442,7 @@ static ControlFlowKind CheckFallThrough(AnalysisDeclContext &AC) {
       if (!live[B->getBlockID()]) {
         if (B->pred_begin() == B->pred_end()) {
           const Stmt *Term = B->getTerminatorStmt();
-          if (Term && isa<CXXTryStmt>(Term))
+          if (isa_and_nonnull<CXXTryStmt>(Term))
             // When not adding EH edges from calls, catch clauses
             // can otherwise seem dead.  Avoid noting them as dead.
             count += reachable_code::ScanReachableFromBlock(B, live);
@@ -1100,7 +1100,7 @@ namespace {
       // issue a warn_fallthrough_attr_unreachable for them.
       for (const auto *B : *Cfg) {
         const Stmt *L = B->getLabel();
-        if (L && isa<SwitchCase>(L) && ReachableBlocks.insert(B).second)
+        if (isa_and_nonnull<SwitchCase>(L) && ReachableBlocks.insert(B).second)
           BlockQueue.push_back(B);
       }
 
@@ -1128,7 +1128,7 @@ namespace {
         if (!P) continue;
 
         const Stmt *Term = P->getTerminatorStmt();
-        if (Term && isa<SwitchStmt>(Term))
+        if (isa_and_nonnull<SwitchStmt>(Term))
           continue; // Switch statement, good.
 
         const SwitchCase *SW = dyn_cast_or_null<SwitchCase>(P->getLabel());
@@ -1327,7 +1327,7 @@ static void DiagnoseSwitchLabelsFallthrough(Sema &S, AnalysisDeclContext &AC,
         B = *B->succ_begin();
         Term = B->getTerminatorStmt();
       }
-      if (!(B->empty() && Term && isa<BreakStmt>(Term))) {
+      if (!(B->empty() && isa_and_nonnull<BreakStmt>(Term))) {
         Preprocessor &PP = S.getPreprocessor();
         StringRef AnnotationSpelling = getFallthroughAttrSpelling(PP, L);
         SmallString<64> TextToInsert(AnnotationSpelling);
diff --git a/clang/lib/Sema/SemaCXXScopeSpec.cpp b/clang/lib/Sema/SemaCXXScopeSpec.cpp
index fca5bd131bbc0f..74cc0f7586ef41 100644
--- a/clang/lib/Sema/SemaCXXScopeSpec.cpp
+++ b/clang/lib/Sema/SemaCXXScopeSpec.cpp
@@ -966,7 +966,7 @@ bool Sema::ActOnCXXNestedNameSpecifier(Scope *S,
       R.setBegin(SS.getRange().getBegin());
 
     Diag(CCLoc, diag::err_non_type_template_in_nested_name_specifier)
-      << (TD && isa<VarTemplateDecl>(TD)) << Template << R;
+        << isa_and_nonnull<VarTemplateDecl>(TD) << Template << R;
     NoteAllFoundTemplates(Template);
     return true;
   }
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 300af02239779f..07cd0727eb3f4a 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3839,11 +3839,11 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto,
   if (CallType != VariadicDoesNotApply &&
       (!FD || FD->getBuiltinID() != Builtin::BI__noop)) {
     unsigned NumParams = Proto ? Proto->getNumParams()
-                       : FDecl && isa<FunctionDecl>(FDecl)
-                           ? cast<FunctionDecl>(FDecl)->getNumParams()
-                       : FDecl && isa<ObjCMethodDecl>(FDecl)
-                           ? cast<ObjCMethodDecl>(FDecl)->param_size()
-                       : 0;
+                         : isa_and_nonnull<FunctionDecl>(FDecl)
+                             ? cast<FunctionDecl>(FDecl)->getNumParams()
+                         : isa_and_nonnull<ObjCMethodDecl>(FDecl)
+                             ? cast<ObjCMethodDecl>(FDecl)->param_size()
+                             : 0;
 
     for (unsigned ArgIdx = NumParams; ArgIdx < Args.size(); ++ArgIdx) {
       // Args[ArgIdx] can be null in malformed code.
diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp
index 8cb453e05a91ba..01270dc65da960 100644
--- a/clang/lib/Sema/SemaExprCXX.cpp
+++ b/clang/lib/Sema/SemaExprCXX.cpp
@@ -2074,7 +2074,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal,
   if (DirectInitRange.isValid()) {
     assert(Initializer && "Have parens but no initializer.");
     InitStyle = CXXNewInitializationStyle::Parens;
-  } else if (Initializer && isa<InitListExpr>(Initializer))
+  } else if (isa_and_nonnull<InitListExpr>(Initializer))
     InitStyle = CXXNewInitializationStyle::Braces;
   else {
     assert((!Initializer || isa<ImplicitValueInitExpr>(Initializer) ||
@@ -3823,7 +3823,7 @@ Sema::ActOnCXXDelete(SourceLocation StartLoc, bool UseGlobal,
 
         // Otherwise, the usual operator delete[] should be the
         // function we just found.
-        else if (OperatorDelete && isa<CXXMethodDecl>(OperatorDelete))
+        else if (isa_and_nonnull<CXXMethodDecl>(OperatorDelete))
           UsualArrayDeleteWantsSize =
             UsualDeallocFnInfo(*this,
                                DeclAccessPair::make(OperatorDelete, AS_public))
@@ -8595,7 +8595,7 @@ static void CheckIfAnyEnclosingLambdasMustCaptureAnyPotentialCaptures(
   assert(S.CurContext->isDependentContext());
 #ifndef NDEBUG
   DeclContext *DC = S.CurContext;
-  while (DC && isa<CapturedDecl>(DC))
+  while (isa_and_nonnull<CapturedDecl>(DC))
     DC = DC->getParent();
   assert(
       CurrentLSI->CallOperator == DC &&
@@ -9172,7 +9172,7 @@ ExprResult Sema::ActOnFinishFullExpr(Expr *FE, SourceLocation CC,
   //  - Teach the handful of places that iterate over FunctionScopes to
   //    stop at the outermost enclosing lexical scope."
   DeclContext *DC = CurContext;
-  while (DC && isa<CapturedDecl>(DC))
+  while (isa_and_nonnull<CapturedDecl>(DC))
     DC = DC->getParent();
   const bool IsInLambdaDeclContext = isLambdaCallOperator(DC);
   if (IsInLambdaDeclContext && CurrentLSI &&
diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp
index ed8b226a6b39f5..7244f3ef4e829e 100644
--- a/clang/lib/Sema/SemaInit.cpp
+++ b/clang/lib/Sema/SemaInit.cpp
@@ -2194,7 +2194,7 @@ void InitListChecker::CheckStructUnionTypes(
 
     // Designated inits always initialize fields, so if we see one, all
     // remaining base classes have no explicit initializer.
-    if (Init && isa<DesignatedInitExpr>(Init))
+    if (isa_and_nonnull<DesignatedInitExpr>(Init))
       Init = nullptr;
 
     // C++ [over.match.class.deduct]p1.6:
@@ -6350,7 +6350,7 @@ void InitializationSequence::InitializeFrom(Sema &S,
     // class member of array type from a parenthesized initializer list.
     else if (S.getLangOpts().CPlusPlus &&
              Entity.getKind() == InitializedEntity::EK_Member &&
-             Initializer && isa<InitListExpr>(Initializer)) {
+             isa_and_nonnull<InitListExpr>(Initializer)) {
       TryListInitialization(S, Entity, Kind, cast<InitListExpr>(Initializer),
                             *this, TreatUnavailableAsInvalid);
       AddParenthesizedArrayInitStep(DestType);
@@ -8793,7 +8793,7 @@ ExprResult InitializationSequence::Perform(Sema &S,
   // constant expressions here in order to perform narrowing checks =(
   EnterExpressionEvaluationContext Evaluated(
       S, EnterExpressionEvaluationContext::InitList,
-      CurInit.get() && isa<InitListExpr>(CurInit.get()));
+      isa_and_nonnull<InitListExpr>(CurInit.get()));
 
   // C++ [class.abstract]p2:
   //   no objects of an abstract class can be created except as subobjects
diff --git a/clang/lib/Sema/SemaStmt.cpp b/clang/lib/Sema/SemaStmt.cpp
index 39a3a350fdcc2a..afc61e3c61f9e2 100644
--- a/clang/lib/Sema/SemaStmt.cpp
+++ b/clang/lib/Sema/SemaStmt.cpp
@@ -3704,7 +3704,7 @@ bool Sema::DeduceFunctionTypeFromReturnExpr(FunctionDecl *FD,
   if (isLambdaConversionOperator(FD))
     return false;
 
-  if (RetExpr && isa<InitListExpr>(RetExpr)) {
+  if (isa_and_nonnull<InitListExpr>(RetExpr)) {
     //  If the deduction is for a return statement and the initializer is
     //  a braced-init-list, the program is ill-formed.
     Diag(RetExpr->getExprLoc(),
diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp
index d1eec1a3707514..76b22d3886fa56 100644
--- a/clang/lib/Sema/SemaTemplate.cpp
+++ b/clang/lib/Sema/SemaTemplate.cpp
@@ -1962,7 +1962,7 @@ DeclResult Sema::CheckClassTemplate(
   // We may have found the injected-class-name of a class template,
   // class template partial specialization, or class template specialization.
   // In these cases, grab the template that is being defined or specialized.
-  if (!PrevClassTemplate && PrevDecl && isa<CXXRecordDecl>(PrevDecl) &&
+  if (!PrevClassTemplate && isa_and_nonnull<CXXRecordDecl>(PrevDecl) &&
       cast<CXXRecordDecl>(PrevDecl)->isInjectedClassName()) {
     PrevDecl = cast<CXXRecordDecl>(PrevDecl->getDeclContext());
     PrevClassTemplate
diff --git a/clang/test/AST/Interp/cxx23.cpp b/clang/test/AST/Interp/cxx23.cpp
index 1efd784abbbe8f..d0991f3ffdff5e 100644
--- a/clang/test/AST/Interp/cxx23.cpp
+++ b/clang/test/AST/Interp/cxx23.cpp
@@ -1,6 +1,6 @@
 // UNSUPPORTED:  target={{.*}}-zos{{.*}}
-// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref20,all,all20 %s
-// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref23,all %s
+// RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=ref,ref20,all,all20 %s
+// RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=ref,ref23,all %s
 // RUN: %clang_cc1 -std=c++20 -fsyntax-only -fcxx-exceptions -verify=expected20,all,all20 %s -fexperimental-new-constant-interpreter
 // RUN: %clang_cc1 -std=c++23 -fsyntax-only -fcxx-exceptions -verify=expected23,all %s -fexperimental-new-constant-interpreter
 
@@ -200,3 +200,15 @@ namespace UndefinedThreeWay {
   static_assert(!(*test_a_threeway)(A(), A())); // all-error {{static assertion expression is not an integral constant expression}} \
                                                 // all-note {{undefined function 'operator<=>' cannot be used in a constant expression}}
 }
+
+/// FIXME: The new interpreter is missing the "initializer of q is not a constant expression" diagnostics.a
+/// That's because the cast from void* to int* is considered fine, but diagnosed. So we don't consider
+/// q to be uninitialized.
+namespace VoidCast {
+  constexpr void* p = nullptr;
+  constexpr int* q = static_cast<int*>(p); // all-error {{must be initialized by a constant expression}} \
+                                           // all-note {{cast from 'void *' is not allowed in a constant expression}} \
+                                           // ref-note {{declared here}}
+  static_assert(q == nullptr); // ref-error {{not an integral constant expression}} \
+                               // ref-note {{initializer of 'q' is not a constant expression}}
+}
diff --git a/clang/test/AST/Interp/cxx26.cpp b/clang/test/AST/Interp/cxx26.cpp
new file mode 100644
index 00000000000000..0b0e2b21e8201e
--- /dev/null
+++ b/clang/test/AST/Interp/cxx26.cpp
@@ -0,0 +1,10 @@
+// RUN: %clang_cc1 -std=c++26 -fsyntax-only -fcxx-exceptions -verify=ref,both %s
+// RUN: %clang_cc1 -std=c++26 -fsyntax-only -fcxx-exceptions -verify=expected,both %s -fexperimental-new-constant-interpreter
+
+// both-no-diagnostics
+
+namespace VoidCast {
+  constexpr void* p = nullptr;
+  constexpr int* q = static_cast<int*>(p);
+  static_assert(q == nullptr);
+}
diff --git a/clang/test/CodeGen/X86/math-builtins.c b/clang/test/CodeGen/X86/math-builtins.c
index 093239b4482609..1e0f129b986102 100644
--- a/clang/test/CodeGen/X86/math-builtins.c
+++ b/clang/test/CodeGen/X86/math-builtins.c
@@ -674,10 +674,10 @@ __builtin_sqrt(f);       __builtin_sqrtf(f);      __builtin_sqrtl(f); __builtin_
 
 __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
 
-// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
-// NO__ERRNO: declare fp128 @tanf128(fp128 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare fp128 @llvm.tan.f128(fp128) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
diff --git a/clang/test/CodeGen/aarch64-cpu-supports-target.c b/clang/test/CodeGen/aarch64-cpu-supports-target.c
index 28187bcf745331..e023944b24e53a 100644
--- a/clang/test/CodeGen/aarch64-cpu-supports-target.c
+++ b/clang/test/CodeGen/aarch64-cpu-supports-target.c
@@ -48,5 +48,5 @@ int test_versions() {
     return code();
 }
 // CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
-// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" }
-// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+sve" }
+// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon" }
+// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
index 9885ac45e6a0e0..af8933d93d6cbb 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/aarch64-sme-attrs.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +bf16 \
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme \
 // RUN:   -disable-O0-optnone -Werror -emit-llvm -o - %s \
 // RUN: | opt -S -passes=mem2reg \
 // RUN: | opt -S -passes=inline \
diff --git a/clang/test/CodeGen/aarch64-targetattr.c b/clang/test/CodeGen/aarch64-targetattr.c
index 644e6a692c3be9..3e7a2092456071 100644
--- a/clang/test/CodeGen/aarch64-targetattr.c
+++ b/clang/test/CodeGen/aarch64-targetattr.c
@@ -58,50 +58,58 @@ void v1msve() {}
 // CHECK-LABEL: @plussve() #12
 __attribute__((target("+sve")))
 void plussve() {}
-// CHECK-LABEL: @plussveplussve2() #12
+// CHECK-LABEL: @plussveplussve2() #13
 __attribute__((target("+sve+nosve2")))
 void plussveplussve2() {}
-// CHECK-LABEL: @plussveminusnosve2() #12
+// CHECK-LABEL: @plussveminusnosve2() #13
 __attribute__((target("sve,no-sve2")))
 void plussveminusnosve2() {}
-// CHECK-LABEL: @plusfp16() #13
+// CHECK-LABEL: @plusfp16() #14
 __attribute__((target("+fp16")))
 void plusfp16() {}
 
-// CHECK-LABEL: @all() #14
+// CHECK-LABEL: @all() #15
 __attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2")))
 void all() {}
-// CHECK-LABEL: @allplusbranchprotection() #15
+// CHECK-LABEL: @allplusbranchprotection() #16
 __attribute__((target("cpu=neoverse-n1,tune=cortex-a710,arch=armv8.6-a+sve2,branch-protection=standard")))
 void allplusbranchprotection() {}
 
-// CHECK-LABEL: @plusnosimd() #16
+// These tests check that the user facing and internal llvm name are both accepted.
+// CHECK-LABEL: @plusnoneon() #17
+__attribute__((target("+noneon")))
+void plusnoneon() {}
+// CHECK-LABEL: @plusnosimd() #17
 __attribute__((target("+nosimd")))
 void plusnosimd() {}
-// CHECK-LABEL: @nosimd() #16
+// CHECK-LABEL: @noneon() #17
+__attribute__((target("no-neon")))
+void noneon() {}
+// CHECK-LABEL: @nosimd() #17
 __attribute__((target("no-simd")))
 void nosimd() {}
 
 // This isn't part of the standard interface, but test that -arch features should not apply anything else.
-// CHECK-LABEL: @minusarch() #17
+// CHECK-LABEL: @minusarch() #18
 __attribute__((target("no-v9.3a")))
 void minusarch() {}
 
 // CHECK: attributes #0 = { {{.*}} "target-features"="+crc,+fp-armv8,+lse,+neon,+ras,+rdm,+v8.1a,+v8.2a,+v8a" }
 // CHECK: attributes #1 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+v8.1a,+v8.2a,+v8a" }
 // CHECK: attributes #2 = { {{.*}} "target-features"="+crc,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8a" }
-// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" }
-// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,+v9a" }
+// CHECK: attributes #3 = { {{.*}} "target-features"="+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" }
+// CHECK: attributes #4 = { {{.*}} "target-cpu"="cortex-a710" "target-features"="+bf16,+complxnum,+crc,+dotprod,+flagm,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+mte,+neon,+pauth,+ras,+rcpc,+rdm,+sb,+sve,+sve2,+sve2-bitperm" }
 // CHECK: attributes #5 = { {{.*}} "tune-cpu"="cortex-a710" }
 // CHECK: attributes #6 = { {{.*}} "target-cpu"="generic" }
 // CHECK: attributes #7 = { {{.*}} "tune-cpu"="generic" }
-// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+v8.1a,+v8.2a,+v8a" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a" }
-// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8a,-sve" }
-// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+sve" }
-// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16" }
-// CHECK: attributes #14 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
-// CHECK: attributes #15 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
-// CHECK-NOT: attributes #16 = {{.*}} "target-features"
-// CHECK: attributes #17 = { {{.*}} "target-features"="-v9.3a" }
+// CHECK: attributes #8 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+crc,+dotprod,+fp-armv8,+fullfp16,+lse,+neon,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #9 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #10 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,+sve,+sve2" }
+// CHECK: attributes #11 = { {{.*}} "target-cpu"="neoverse-v1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fp16fml,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+rand,+ras,+rcpc,+rdm,+sha2,+sha3,+sm4,+spe,+ssbs,-sve" }
+// CHECK: attributes #12 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve" }
+// CHECK: attributes #13 = { {{.*}} "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-sve2" }
+// CHECK: attributes #14 = { {{.*}} "target-features"="+fullfp16" }
+// CHECK: attributes #15 = { {{.*}} "target-cpu"="neoverse-n1" "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #16 = { {{.*}} "branch-target-enforcement"="true" "guarded-control-stack"="true" {{.*}} "target-features"="+aes,+bf16,+complxnum,+crc,+dotprod,+fp-armv8,+fullfp16,+i8mm,+jsconv,+lse,+neon,+pauth,+ras,+rcpc,+rdm,+sha2,+spe,+ssbs,+sve,+sve2,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8.6a,+v8a" "tune-cpu"="cortex-a710" }
+// CHECK: attributes #17 = { {{.*}} "target-features"="-neon" }
+// CHECK: attributes #18 = { {{.*}} "target-features"="-v9.3a" }
diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c
index 75f8734e5aaf37..3597711333d341 100644
--- a/clang/test/CodeGen/attr-target-version.c
+++ b/clang/test/CodeGen/attr-target-version.c
@@ -1129,42 +1129,42 @@ int caller(void) { return used_def_without_default_decl() + used_decl_without_de
 // CHECK-NOFMV-NEXT:    ret i32 0
 //
 //.
-// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+neon,+rand,-v9.5a" }
+// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp16fml,+fullfp16,+neon,+rand,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+sme,+sme-i16i64,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+sha2,-v9.5a" }
-// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+ls64,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+sha2,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR3]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+ls64,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp16fml,+fullfp16,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme2,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR9:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+sb,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
+// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
-// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rdm,-v9.5a" }
-// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+jsconv,+neon,-v9.5a" }
-// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+neon,+rdm,+sme,-v9.5a" }
-// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
+// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR19:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR20:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+rdm,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+jsconv,+neon,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fullfp16,+neon,+rdm,+sme,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fullfp16,+i8mm,+neon,+sha2,+sha3,+sve,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR25]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR26]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+jsconv,+neon,-v9.5a" }
+// CHECK: attributes #[[ATTR27]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+jsconv,+neon,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR28]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+rcpc,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+neon,+sve,-v9.5a" }
-// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-v9.5a" }
-// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-v9.5a" }
-// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-v9.5a" }
+// CHECK: attributes #[[ATTR29]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+neon,+sve,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR30]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR31]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR32]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+mte,+neon,+sve,+sve2,+sve2-sm4,-fp-armv8,-v9.5a" }
 // CHECK: attributes #[[ATTR33]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+mops,+mte,+rcpc,+rcpc3,-fp-armv8,-v9.5a" }
-// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sm4,-v9.5a" }
-// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon,+rdm,-v9.5a" }
+// CHECK: attributes #[[ATTR34]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+neon,+sm4,-fp-armv8,-v9.5a" }
+// CHECK: attributes #[[ATTR35]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+lse,+neon,+rdm,-fp-armv8,-v9.5a" }
 //.
 // CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
 // CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" }
diff --git a/clang/test/CodeGen/constrained-math-builtins.c b/clang/test/CodeGen/constrained-math-builtins.c
index 2de832dd2b6cae..6cc3a10a1e7946 100644
--- a/clang/test/CodeGen/constrained-math-builtins.c
+++ b/clang/test/CodeGen/constrained-math-builtins.c
@@ -183,6 +183,14 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: call x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 // CHECK: call fp128 @llvm.experimental.constrained.sqrt.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
 
+  __builtin_tan(f);        __builtin_tanf(f);       __builtin_tanl(f); __builtin_tanf128(f);
+
+// CHECK: call double @llvm.experimental.constrained.tan.f64(double %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call float @llvm.experimental.constrained.tan.f32(float %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+// CHECK: call fp128 @llvm.experimental.constrained.tan.f128(fp128 %{{.*}}, metadata !"round.tonearest", metadata !"fpexcept.strict")
+
+
   __builtin_trunc(f);      __builtin_truncf(f);     __builtin_truncl(f); __builtin_truncf128(f);
 
 // CHECK: call double @llvm.experimental.constrained.trunc.f64(double %{{.*}}, metadata !"fpexcept.strict")
@@ -315,6 +323,11 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c, _
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.sqrt.f80(x86_fp80, metadata, metadata)
 // CHECK: declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
 
+// CHECK: declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+// CHECK: declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+// CHECK: declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata)
+// CHECK: declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
+
 // CHECK: declare double @llvm.experimental.constrained.trunc.f64(double, metadata)
 // CHECK: declare float @llvm.experimental.constrained.trunc.f32(float, metadata)
 // CHECK: declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
diff --git a/clang/test/CodeGen/instrument-objc-method.m b/clang/test/CodeGen/instrument-objc-method.m
index 2c9d1fc88554bd..cfc0a0a98bec6b 100644
--- a/clang/test/CodeGen/instrument-objc-method.m
+++ b/clang/test/CodeGen/instrument-objc-method.m
@@ -11,16 +11,16 @@ @implementation ObjCClass
 + (void)initialize {
 }
 
-// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #1
+// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+// BARE: @"\01+[ObjCClass load]"{{\(.*\)}} #2
 + (void)load __attribute__((no_instrument_function)) {
 }
 
-// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1
-// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #1
+// PREINLINE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2
+// BARE: @"\01-[ObjCClass dealloc]"{{\(.*\)}} #2
 - (void)dealloc __attribute__((no_instrument_function)) {
 }
 
-// PREINLINE: declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 // PREINLINE: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter"
 // PREINLINE-NOT: attributes #0 = { {{.*}}"instrument-function-entry"="__cyg_profile_func_enter_bare"
 // PREINLINE-NOT: attributes #2 = { {{.*}}"__cyg_profile_func_enter"
diff --git a/clang/test/CodeGen/math-libcalls.c b/clang/test/CodeGen/math-libcalls.c
index 29c312ba0ecac2..a249182692762d 100644
--- a/clang/test/CodeGen/math-libcalls.c
+++ b/clang/test/CodeGen/math-libcalls.c
@@ -662,15 +662,15 @@ void foo(double *d, float f, float *fp, long double *l, int *i, const char *c) {
 
   tan(f);        tanf(f);       tanl(f);
 
-// NO__ERRNO: declare double @tan(double noundef) [[READNONE]]
-// NO__ERRNO: declare float @tanf(float noundef) [[READNONE]]
-// NO__ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[READNONE]]
+// NO__ERRNO: declare double @llvm.tan.f64(double) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare float @llvm.tan.f32(float) [[READNONE_INTRINSIC]]
+// NO__ERRNO: declare x86_fp80 @llvm.tan.f80(x86_fp80) [[READNONE_INTRINSIC]]
 // HAS_ERRNO: declare double @tan(double noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare float @tanf(float noundef) [[NOT_READNONE]]
 // HAS_ERRNO: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare double @tan(double noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare float @tanf(float noundef) [[NOT_READNONE]]
-// HAS_MAYTRAP: declare x86_fp80 @tanl(x86_fp80 noundef) [[NOT_READNONE]]
+// HAS_MAYTRAP: declare double @llvm.experimental.constrained.tan.f64(
+// HAS_MAYTRAP: declare float @llvm.experimental.constrained.tan.f32(
+// HAS_MAYTRAP: declare x86_fp80 @llvm.experimental.constrained.tan.f80(
 
   tanh(f);       tanhf(f);      tanhl(f);
 
diff --git a/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp b/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp
new file mode 100644
index 00000000000000..f51fb79bda6afd
--- /dev/null
+++ b/clang/test/CodeGen/sanitize-numerical-stability-attr.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s | FileCheck -check-prefix=WITHOUT %s
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=numerical | FileCheck -check-prefix=NSAN %s
+// RUN: echo "src:%s" | sed -e 's/\\/\\\\/g' > %t
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -emit-llvm -o - %s -fsanitize=numerical -fsanitize-ignorelist=%t | FileCheck -check-prefix=BL %s
+
+// WITHOUT:  NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+// BL:  NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+// NSAN:  NoNSAN3{{.*}}) [[NOATTR:#[0-9]+]]
+__attribute__((no_sanitize("numerical")))
+int NoNSAN3(int *a) { return *a; }
+
+// WITHOUT:  NSANOk{{.*}}) [[NOATTR]]
+// BL:  NSANOk{{.*}}) [[NOATTR]]
+// NSAN: NSANOk{{.*}}) [[WITH:#[0-9]+]]
+int NSANOk(int *a) { return *a; }
+
+// WITHOUT:  TemplateNSANOk{{.*}}) [[NOATTR]]
+// BL:  TemplateNSANOk{{.*}}) [[NOATTR]]
+// NSAN: TemplateNSANOk{{.*}}) [[WITH]]
+template<int i>
+int TemplateNSANOk() { return i; }
+
+// WITHOUT:  TemplateNoNSAN{{.*}}) [[NOATTR]]
+// BL:  TemplateNoNSAN{{.*}}) [[NOATTR]]
+// NSAN: TemplateNoNSAN{{.*}}) [[NOATTR]]
+template<int i>
+__attribute__((no_sanitize("numerical")))
+int TemplateNoNSAN() { return i; }
+
+int force_instance = TemplateNSANOk<42>() + TemplateNoNSAN<42>();
+
+// WITHOUT: attributes [[NOATTR]] = { mustprogress noinline nounwind{{.*}} }
+// BL: attributes [[NOATTR]] = { mustprogress noinline nounwind{{.*}} }
+// NSAN: attributes [[WITH]] = { mustprogress noinline nounwind optnone sanitize_numerical_stability{{.*}} }
diff --git a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
index 81d93343565209..fb236aeb982e01 100644
--- a/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
+++ b/clang/test/CodeGenCXX/attr-likelihood-if-vs-builtin-expect.cpp
@@ -221,5 +221,5 @@ void tu2(int &i) {
   }
 }
 
-// CHECK: [[BW_LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
-// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
+// CHECK: [[BW_LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
+// CHECK: [[BW_UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
diff --git a/clang/test/CodeGenObjC/encode-test-3.m b/clang/test/CodeGenObjC/encode-test-3.m
index 0856b770d65a50..30557fccf02df7 100644
--- a/clang/test/CodeGenObjC/encode-test-3.m
+++ b/clang/test/CodeGenObjC/encode-test-3.m
@@ -1,4 +1,5 @@
 // RUN: %clang_cc1 -triple=i686-apple-darwin9 -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple=i686-apple-darwin9 -emit-llvm -o - %s -fexperimental-new-constant-interpreter | FileCheck %s
 
 int main(void) {
   int n;
diff --git a/clang/test/CodeGenOpenCL/builtins-f16.cl b/clang/test/CodeGenOpenCL/builtins-f16.cl
index adf7cdde154f51..d7bffdad5c548f 100644
--- a/clang/test/CodeGenOpenCL/builtins-f16.cl
+++ b/clang/test/CodeGenOpenCL/builtins-f16.cl
@@ -66,6 +66,9 @@ void test_half_builtins(half h0, half h1, half h2, int i0) {
   // CHECK: call half @llvm.sqrt.f16(half %h0)
   res = __builtin_sqrtf16(h0);
 
+  // CHECK: call half @llvm.tan.f16(half %h0)
+  res = __builtin_tanf16(h0);
+
   // CHECK: call half @llvm.trunc.f16(half %h0)
   res = __builtin_truncf16(h0);
 
diff --git a/clang/test/Driver/apple-os-triples.c b/clang/test/Driver/apple-os-triples.c
new file mode 100644
index 00000000000000..7664d3bc19fca2
--- /dev/null
+++ b/clang/test/Driver/apple-os-triples.c
@@ -0,0 +1,31 @@
+// Test triple manipulations.
+
+// RUN: %clang -### -c %s \
+// RUN:   --target=i386-apple-darwin10 -mappletvsimulator-version-min=9.0 -arch x86_64 2>&1 | \
+// RUN:   FileCheck %s -DARCH=x86_64 -DOS=tvos9.0.0-simulator
+// RUN: %clang -### -c %s \
+// RUN:   --target=armv7s-apple-darwin10 -mappletvos-version-min=9.0 -arch arm64 2>&1 | \
+// RUN:   FileCheck %s -DARCH=arm64 -DOS=tvos9.0.0
+// RUN: env TVOS_DEPLOYMENT_TARGET=9.0 %clang -### -c %s \
+// RUN:   -isysroot SDKs/MacOSX10.9.sdk -target i386-apple-darwin10  -arch x86_64 2>&1 | \
+// RUN:   FileCheck %s -DARCH=x86_64 -DOS=tvos9.0.0
+
+// RUN: %clang -### -c %s \
+// RUN:   --target=x86_64-apple-driverkit19.0 2>&1 | \
+// RUN:   FileCheck %s -DARCH=x86_64 -DOS=driverkit19.0.0
+
+// RUN: %clang -### -c %s \
+// RUN:   --target=i386-apple-darwin10 -miphonesimulator-version-min=7.0 -arch i386 2>&1 | \
+// RUN:   FileCheck %s -DARCH=i386 -DOS=ios7.0.0-simulator
+// RUN: %clang -### -c %s \
+// RUN:   --target=armv7s-apple-darwin10 -miphoneos-version-min=7.0 -arch armv7s 2>&1 | \
+// RUN:   FileCheck %s -DARCH=thumbv7s -DOS=ios7.0.0
+
+// RUN: %clang -### -c %s \
+// RUN:   --target=i386-apple-darwin10 -mwatchsimulator-version-min=2.0 -arch i386 2>&1 | \
+// RUN:   FileCheck %s -DARCH=i386 -DOS=watchos2.0.0-simulator
+// RUN: %clang -### -c %s \
+// RUN:   --target=armv7s-apple-darwin10 -mwatchos-version-min=2.0 -arch armv7k 2>&1 | \
+// RUN:   FileCheck %s -DARCH=thumbv7k -DOS=watchos2.0.0
+
+// CHECK: "-cc1" "-triple" "[[ARCH]]-apple-[[OS]]"
diff --git a/clang/test/Driver/appletvos-version-min.c b/clang/test/Driver/appletvos-version-min.c
deleted file mode 100644
index 7cbb2001a3ec21..00000000000000
--- a/clang/test/Driver/appletvos-version-min.c
+++ /dev/null
@@ -1,8 +0,0 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: aarch64-registered-target
-// RUN: %clang -target i386-apple-darwin10 -mappletvsimulator-version-min=9.0 -arch x86_64 -S -o - %s | FileCheck %s
-// RUN: %clang -target armv7s-apple-darwin10 -mappletvos-version-min=9.0 -arch arm64 -S -o - %s | FileCheck %s
-// RUN: env TVOS_DEPLOYMENT_TARGET=9.0 %clang -isysroot SDKs/MacOSX10.9.sdk -target i386-apple-darwin10  -arch x86_64 -S -o - %s | FileCheck %s
-
-int main() { return 0; }
-// CHECK: .tvos_version_min 9, 0
diff --git a/clang/test/Driver/driverkit-version-min.c b/clang/test/Driver/driverkit-version-min.c
deleted file mode 100644
index 9966152f11ce82..00000000000000
--- a/clang/test/Driver/driverkit-version-min.c
+++ /dev/null
@@ -1,5 +0,0 @@
-// REQUIRES: x86-registered-target
-// RUN: %clang -target x86_64-apple-driverkit19.0 -S -o - %s | FileCheck %s
-
-int main() { return 0; }
-// CHECK: .build_version driverkit, 19, 0
diff --git a/clang/test/Driver/fsanitize.c b/clang/test/Driver/fsanitize.c
index 571f79a6e7f70d..ba64b3dcb11aa5 100644
--- a/clang/test/Driver/fsanitize.c
+++ b/clang/test/Driver/fsanitize.c
@@ -459,6 +459,21 @@
 // CHECK-TSAN-MSAN-MSAN-DARWIN: unsupported option '-fsanitize=memory' for target 'x86_64-apple-darwin10'
 // CHECK-TSAN-MSAN-MSAN-DARWIN-NOT: unsupported option
 
+// RUN: %clang --target=x86_64-linux-gnu -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-X86-64-LINUX
+// CHECK-NSAN-X86-64-LINUX: "-fsanitize=numerical"
+
+// RUN: %clang --target=aarch64-unknown-linux-gnu -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-AARCH64-LINUX
+// CHECK-NSAN-AARCH64-LINUX: "-fsanitize=numerical"
+
+// RUN: not %clang --target=mips-unknown-linux -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-MIPS-LINUX
+// CHECK-NSAN-MIPS-LINUX: error: unsupported option '-fsanitize=numerical' for target 'mips-unknown-linux'
+
+// RUN: %clang --target=x86_64-apple-macos -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-X86-64-MACOS
+// CHECK-NSAN-X86-64-MACOS: "-fsanitize=numerical"
+
+// RUN: %clang --target=arm64-apple-macos -fsanitize=numerical %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-NSAN-ARM64-MACOS
+// CHECK-NSAN-ARM64-MACOS: "-fsanitize=numerical"
+
 // RUN: %clang --target=x86_64-apple-darwin -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-DARWIN
 // CHECK-TSAN-X86-64-DARWIN-NOT: unsupported option
 // RUN: %clang --target=x86_64-apple-macos -fsanitize=thread %s -### 2>&1 | FileCheck %s --check-prefix=CHECK-TSAN-X86-64-MACOS
diff --git a/clang/test/Driver/ios-version-min.c b/clang/test/Driver/ios-version-min.c
deleted file mode 100644
index aa536cf7827b36..00000000000000
--- a/clang/test/Driver/ios-version-min.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: arm-registered-target
-// RUN: %clang -target i386-apple-darwin10 -miphonesimulator-version-min=7.0 -arch i386 -S -o - %s | FileCheck %s
-// RUN: %clang -target armv7s-apple-darwin10 -miphoneos-version-min=7.0 -arch armv7s -S -o - %s | FileCheck %s
-
-int main() { return 0; }
-// CHECK: .ios_version_min 7, 0
diff --git a/clang/test/Driver/riscv-arch.c b/clang/test/Driver/riscv-arch.c
index ddf617bbb62372..ffd92e1f398c45 100644
--- a/clang/test/Driver/riscv-arch.c
+++ b/clang/test/Driver/riscv-arch.c
@@ -231,11 +231,6 @@
 // RV32-STD: error: invalid arch name 'rv32imqc',
 // RV32-STD: unsupported standard user-level extension 'q'
 
-// RUN: not %clang --target=riscv32-unknown-elf -march=rv32ib -### %s \
-// RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32-B %s
-// RV32-B: error: invalid arch name 'rv32ib',
-// RV32-B: unsupported standard user-level extension 'b'
-
 // RUN: not %clang --target=riscv32-unknown-elf -march=rv32xabc -### %s \
 // RUN: -fsyntax-only 2>&1 | FileCheck -check-prefix=RV32X %s
 // RV32X: error: invalid arch name 'rv32xabc',
diff --git a/clang/test/Driver/watchos-version-min.c b/clang/test/Driver/watchos-version-min.c
deleted file mode 100644
index 8f12285d4e4737..00000000000000
--- a/clang/test/Driver/watchos-version-min.c
+++ /dev/null
@@ -1,7 +0,0 @@
-// REQUIRES: x86-registered-target
-// REQUIRES: arm-registered-target
-// RUN: %clang -target i386-apple-darwin10 -mwatchsimulator-version-min=2.0 -arch i386 -S -o - %s | FileCheck %s
-// RUN: %clang -target armv7s-apple-darwin10 -mwatchos-version-min=2.0 -arch armv7k -S -o - %s | FileCheck %s
-
-int main() { return 0; }
-// CHECK: .watchos_version_min 2, 0
diff --git a/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp b/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp
new file mode 100644
index 00000000000000..78884977322b8e
--- /dev/null
+++ b/clang/test/Lexer/has_feature_numerical_stability_sanitizer.cpp
@@ -0,0 +1,11 @@
+// RUN: %clang_cc1 -E -fsanitize=numerical %s -o - | FileCheck --check-prefix=CHECK-NSAN %s
+// RUN: %clang_cc1 -E  %s -o - | FileCheck --check-prefix=CHECK-NO-NSAN %s
+
+#if __has_feature(numerical_stability_sanitizer)
+int NumericalStabilitySanitizerEnabled();
+#else
+int NumericalStabilitySanitizerDisabled();
+#endif
+
+// CHECK-NSAN: NumericalStabilitySanitizerEnabled
+// CHECK-NO-NSAN: NumericalStabilitySanitizerDisabled
diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index 09b9ad0a160bb1..91307141e0406b 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -7,6 +7,7 @@
 // CHECK-NOT: __riscv_64e {{.*$}}
 // CHECK-NOT: __riscv_a {{.*$}}
 // CHECK-NOT: __riscv_atomic
+// CHECK-NOT: __riscv_b {{.*$}}
 // CHECK-NOT: __riscv_c {{.*$}}
 // CHECK-NOT: __riscv_compressed {{.*$}}
 // CHECK-NOT: __riscv_d {{.*$}}
@@ -194,6 +195,17 @@
 // CHECK-A-EXT: __riscv_a 2001000{{$}}
 // CHECK-A-EXT: __riscv_atomic 1
 
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN:   -march=rv32ib -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-B-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN:   -march=rv64ib -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-B-EXT %s
+// CHECK-B-EXT: __riscv_b 1000000{{$}}
+// CHECK-B-EXT: __riscv_zba 1000000{{$}}
+// CHECK-B-EXT: __riscv_zbb 1000000{{$}}
+// CHECK-B-EXT: __riscv_zbs 1000000{{$}}
+
 // RUN: %clang --target=riscv32-unknown-linux-gnu \
 // RUN:   -march=rv32ic -E -dM %s \
 // RUN:   -o - | FileCheck --check-prefix=CHECK-C-EXT %s
diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c
index 642afddd88c154..fa45fff1d183d6 100644
--- a/clang/test/Sema/aarch64-neon-target.c
+++ b/clang/test/Sema/aarch64-neon-target.c
@@ -69,8 +69,8 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t
   vrnd_f16(v4f16); // expected-error {{always_inline function 'vrnd_f16' requires target feature 'fullfp16'}}
   vmaxnm_f16(v4f16, v4f16); // expected-error {{always_inline function 'vmaxnm_f16' requires target feature 'fullfp16'}}
   vrndi_f16(v4f16); // expected-error {{always_inline function 'vrndi_f16' requires target feature 'fullfp16'}}
-  // fp16fml depends on fp-armv8
-  vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp-armv8'}}
+  // fp16fml
+  vfmlal_low_f16(v2f32, v4f16, v4f16); // expected-error {{always_inline function 'vfmlal_low_f16' requires target feature 'fp16fml'}}
   // i8mm
   vmmlaq_s32(v4i32, v8i16, v8i16); // expected-error {{always_inline function 'vmmlaq_s32' requires target feature 'i8mm'}}
   vusdot_laneq_s32(v2i32, v8i8, v8i16, 0); // expected-error {{always_inline function 'vusdot_s32' requires target feature 'i8mm'}}
diff --git a/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp b/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp
new file mode 100644
index 00000000000000..cd1c076ab09e6b
--- /dev/null
+++ b/clang/unittests/Analysis/FlowSensitive/ASTOpsTest.cpp
@@ -0,0 +1,88 @@
+//===- unittests/Analysis/FlowSensitive/ASTOpsTest.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/ASTOps.h"
+#include "TestingSupport.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+#include <memory>
+
+namespace {
+
+using namespace clang;
+using namespace dataflow;
+
+using ast_matchers::cxxRecordDecl;
+using ast_matchers::hasName;
+using ast_matchers::hasType;
+using ast_matchers::initListExpr;
+using ast_matchers::match;
+using ast_matchers::selectFirst;
+using test::findValueDecl;
+using testing::IsEmpty;
+using testing::UnorderedElementsAre;
+
+TEST(ASTOpsTest, RecordInitListHelperOnEmptyUnionInitList) {
+  // This is a regression test: The `RecordInitListHelper` used to assert-fail
+  // when called for the `InitListExpr` of an empty union.
+  std::string Code = R"cc(
+    struct S {
+      S() : UField{} {};
+
+      union U {} UField;
+    };
+  )cc";
+  std::unique_ptr<ASTUnit> Unit =
+      tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++17"});
+  auto &ASTCtx = Unit->getASTContext();
+
+  ASSERT_EQ(ASTCtx.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+  auto *InitList = selectFirst<InitListExpr>(
+      "init",
+      match(initListExpr(hasType(cxxRecordDecl(hasName("U")))).bind("init"),
+            ASTCtx));
+  ASSERT_NE(InitList, nullptr);
+
+  RecordInitListHelper Helper(InitList);
+  EXPECT_THAT(Helper.base_inits(), IsEmpty());
+  EXPECT_THAT(Helper.field_inits(), IsEmpty());
+}
+
+TEST(ASTOpsTest, ReferencedDeclsOnUnionInitList) {
+  // This is a regression test: `getReferencedDecls()` used to return a null
+  // `FieldDecl` in this case (in addition to the correct non-null `FieldDecl`)
+  // because `getInitializedFieldInUnion()` returns null for the syntactic form
+  // of the `InitListExpr`.
+  std::string Code = R"cc(
+    struct S {
+      S() : UField{0} {};
+
+      union U {
+        int I;
+      } UField;
+    };
+  )cc";
+  std::unique_ptr<ASTUnit> Unit =
+      tooling::buildASTFromCodeWithArgs(Code, {"-fsyntax-only", "-std=c++17"});
+  auto &ASTCtx = Unit->getASTContext();
+
+  ASSERT_EQ(ASTCtx.getDiagnostics().getClient()->getNumErrors(), 0U);
+
+  auto *InitList = selectFirst<InitListExpr>(
+      "init",
+      match(initListExpr(hasType(cxxRecordDecl(hasName("U")))).bind("init"),
+            ASTCtx));
+  ASSERT_NE(InitList, nullptr);
+  auto *IDecl = cast<FieldDecl>(findValueDecl(ASTCtx, "I"));
+
+  EXPECT_THAT(getReferencedDecls(*InitList).Fields,
+              UnorderedElementsAre(IDecl));
+}
+
+} // namespace
diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
index cfabb80576bc12..12fee5dc2789ce 100644
--- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
@@ -5,6 +5,7 @@ set(LLVM_LINK_COMPONENTS
 
 add_clang_unittest(ClangAnalysisFlowSensitiveTests
   ArenaTest.cpp
+  ASTOpsTest.cpp
   CFGMatchSwitchTest.cpp
   ChromiumCheckModelTest.cpp
   DataflowAnalysisContextTest.cpp
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index f7e6b0c22e8db2..2a74d7fa63fd74 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -3345,6 +3345,32 @@ TEST(TransferTest, ResultObjectLocationForBuiltinBitCastExpr) {
       });
 }
 
+TEST(TransferTest, ResultObjectLocationForAtomicExpr) {
+  std::string Code = R"(
+    struct S {};
+    void target(_Atomic(S) *ptr) {
+      S s = __c11_atomic_load(ptr, __ATOMIC_SEQ_CST);
+      // [[p]]
+    }
+  )";
+  using ast_matchers::atomicExpr;
+  using ast_matchers::match;
+  using ast_matchers::selectFirst;
+  using ast_matchers::traverse;
+  runDataflow(
+      Code,
+      [](const llvm::StringMap<DataflowAnalysisState<NoopLattice>> &Results,
+         ASTContext &ASTCtx) {
+        const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
+
+        auto *Atomic = selectFirst<AtomicExpr>(
+            "atomic", match(atomicExpr().bind("atomic"), ASTCtx));
+
+        EXPECT_EQ(&Env.getResultObjectLocation(*Atomic),
+                  &getLocForDecl<RecordStorageLocation>(ASTCtx, Env, "s"));
+      });
+}
+
 TEST(TransferTest, ResultObjectLocationPropagatesThroughConditionalOperator) {
   std::string Code = R"(
     struct A {
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
index 5f14d7781004b3..3707273e0cbd48 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Inquiry.h
@@ -20,12 +20,14 @@ class FirOpBuilder;
 
 namespace fir::runtime {
 
-/// Generate call to general `LboundDim` runtime routine.  Calls to LBOUND
-/// without a DIM argument get transformed into descriptor inquiries so they're
-/// not handled in the runtime.
+/// Generate call to `LboundDim` runtime routine.
 mlir::Value genLboundDim(fir::FirOpBuilder &builder, mlir::Location loc,
                          mlir::Value array, mlir::Value dim);
 
+/// Generate call to Lbound` runtime routine.
+void genLbound(fir::FirOpBuilder &builder, mlir::Location loc,
+               mlir::Value resultAddr, mlir::Value arrayt, mlir::Value kind);
+
 /// Generate call to general `Ubound` runtime routine.  Calls to UBOUND
 /// with a DIM argument get transformed into an expression equivalent to
 /// SIZE() + LBOUND() - 1, so they don't have an intrinsic in the runtime.
diff --git a/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h b/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h
index 27652208b524ee..fedf453a6dc8de 100644
--- a/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h
+++ b/flang/include/flang/Optimizer/Builder/Runtime/Reduction.h
@@ -240,6 +240,13 @@ mlir::Value genReduce(fir::FirOpBuilder &builder, mlir::Location loc,
                       mlir::Value maskBox, mlir::Value identity,
                       mlir::Value ordered);
 
+/// Generate call to `Reduce` intrinsic runtime routine. This is the version
+/// that takes arrays of any rank with a dim argument specified.
+void genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc,
+                  mlir::Value arrayBox, mlir::Value operation, mlir::Value dim,
+                  mlir::Value maskBox, mlir::Value identity,
+                  mlir::Value ordered, mlir::Value resultBox);
+
 } // namespace fir::runtime
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_REDUCTION_H
diff --git a/flang/lib/Optimizer/CodeGen/DescriptorModel.h b/flang/include/flang/Optimizer/CodeGen/DescriptorModel.h
similarity index 88%
rename from flang/lib/Optimizer/CodeGen/DescriptorModel.h
rename to flang/include/flang/Optimizer/CodeGen/DescriptorModel.h
index ed35caef930149..ff0cf29e8073e6 100644
--- a/flang/lib/Optimizer/CodeGen/DescriptorModel.h
+++ b/flang/include/flang/Optimizer/CodeGen/DescriptorModel.h
@@ -35,73 +35,73 @@ using TypeBuilderFunc = mlir::Type (*)(mlir::MLIRContext *);
 
 /// Get the LLVM IR dialect model for building a particular C++ type, `T`.
 template <typename T>
-TypeBuilderFunc getModel();
+static TypeBuilderFunc getModel();
 
 template <>
-TypeBuilderFunc getModel<void *>() {
+constexpr TypeBuilderFunc getModel<void *>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::LLVM::LLVMPointerType::get(context);
   };
 }
 template <>
-TypeBuilderFunc getModel<unsigned>() {
+constexpr TypeBuilderFunc getModel<unsigned>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(unsigned) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<int>() {
+constexpr TypeBuilderFunc getModel<int>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(int) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<unsigned long>() {
+constexpr TypeBuilderFunc getModel<unsigned long>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(unsigned long) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<unsigned long long>() {
+constexpr TypeBuilderFunc getModel<unsigned long long>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(unsigned long long) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<long long>() {
+constexpr TypeBuilderFunc getModel<long long>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(long long) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<Fortran::ISO::CFI_rank_t>() {
+constexpr TypeBuilderFunc getModel<Fortran::ISO::CFI_rank_t>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context,
                                   sizeof(Fortran::ISO::CFI_rank_t) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<Fortran::ISO::CFI_type_t>() {
+constexpr TypeBuilderFunc getModel<Fortran::ISO::CFI_type_t>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context,
                                   sizeof(Fortran::ISO::CFI_type_t) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<long>() {
+constexpr TypeBuilderFunc getModel<long>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     return mlir::IntegerType::get(context, sizeof(long) * 8);
   };
 }
 template <>
-TypeBuilderFunc getModel<Fortran::ISO::CFI_dim_t>() {
+constexpr TypeBuilderFunc getModel<Fortran::ISO::CFI_dim_t>() {
   return [](mlir::MLIRContext *context) -> mlir::Type {
     auto indexTy = getModel<Fortran::ISO::CFI_index_t>()(context);
     return mlir::LLVM::LLVMArrayType::get(indexTy, 3);
   };
 }
 template <>
-TypeBuilderFunc
+constexpr TypeBuilderFunc
 getModel<Fortran::ISO::cfi_internal::FlexibleArray<Fortran::ISO::CFI_dim_t>>() {
   return getModel<Fortran::ISO::CFI_dim_t>();
 }
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 6101730ce17280..4cdf1f2d98caa4 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -5790,7 +5790,17 @@ IntrinsicLibrary::genReduce(mlir::Type resultType,
     return fir::runtime::genReduce(builder, loc, array, operation, mask,
                                    identity, ordered);
   }
-  TODO(loc, "reduce with array result");
+  // Handle cases that have an array result.
+  // Create mutable fir.box to be passed to the runtime for the result.
+  mlir::Type resultArrayType = builder.getVarLenSeqTy(resultType, rank - 1);
+  fir::MutableBoxValue resultMutableBox =
+      fir::factory::createTempMutableBox(builder, loc, resultArrayType);
+  mlir::Value resultIrBox =
+      fir::factory::getMutableIRBox(builder, loc, resultMutableBox);
+  mlir::Value dim = fir::getBase(args[2]);
+  fir::runtime::genReduceDim(builder, loc, array, operation, dim, mask,
+                             identity, ordered, resultIrBox);
+  return readAndAddCleanUp(resultMutableBox, resultType, "REDUCE");
 }
 
 // REPEAT
@@ -6352,16 +6362,17 @@ IntrinsicLibrary::genLbound(mlir::Type resultType,
                             llvm::ArrayRef<fir::ExtendedValue> args) {
   assert(args.size() == 2 || args.size() == 3);
   const fir::ExtendedValue &array = args[0];
-  if (const auto *boxValue = array.getBoxOf<fir::BoxValue>())
-    if (boxValue->hasAssumedRank())
-      TODO(loc, "intrinsic: lbound with assumed rank argument");
+  // Semantics builds signatures for LBOUND calls as either
+  // LBOUND(array, dim, [kind]) or LBOUND(array, [kind]).
+  const bool dimIsAbsent = args.size() == 2 || isStaticallyAbsent(args, 1);
+  if (array.hasAssumedRank() && dimIsAbsent)
+    return genAssumedRankBoundInquiry(builder, loc, resultType, args,
+                                      /*kindPos=*/1, fir::runtime::genLbound);
 
   mlir::Type indexType = builder.getIndexType();
 
-  // Semantics builds signatures for LBOUND calls as either
-  // LBOUND(array, dim, [kind]) or LBOUND(array, [kind]).
-  if (args.size() == 2 || isStaticallyAbsent(args, 1)) {
-    // DIM is absent.
+  if (dimIsAbsent) {
+    // DIM is absent and the rank of array is a compile time constant.
     mlir::Type lbType = fir::unwrapSequenceType(resultType);
     unsigned rank = array.rank();
     mlir::Type lbArrayType = fir::SequenceType::get(
@@ -6386,13 +6397,16 @@ IntrinsicLibrary::genLbound(mlir::Type resultType,
   // DIM is present.
   mlir::Value dim = fir::getBase(args[1]);
 
-  // If it is a compile time constant, skip the runtime call.
-  if (std::optional<std::int64_t> cstDim = fir::getIntIfConstant(dim)) {
-    mlir::Value one = builder.createIntegerConstant(loc, resultType, 1);
-    mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0);
-    mlir::Value lb = computeLBOUND(builder, loc, array, *cstDim - 1, zero, one);
-    return builder.createConvert(loc, resultType, lb);
-  }
+  // If it is a compile time constant and the rank is known, skip the runtime
+  // call.
+  if (!array.hasAssumedRank())
+    if (std::optional<std::int64_t> cstDim = fir::getIntIfConstant(dim)) {
+      mlir::Value one = builder.createIntegerConstant(loc, resultType, 1);
+      mlir::Value zero = builder.createIntegerConstant(loc, indexType, 0);
+      mlir::Value lb =
+          computeLBOUND(builder, loc, array, *cstDim - 1, zero, one);
+      return builder.createConvert(loc, resultType, lb);
+    }
 
   fir::ExtendedValue box = createBoxForRuntimeBoundInquiry(loc, builder, array);
   return builder.createConvert(
diff --git a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
index 34c4020b5907c9..e01a6f05b5fdd8 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Inquiry.cpp
@@ -29,6 +29,20 @@ mlir::Value fir::runtime::genLboundDim(fir::FirOpBuilder &builder,
   return builder.create<fir::CallOp>(loc, lboundFunc, args).getResult(0);
 }
 
+void fir::runtime::genLbound(fir::FirOpBuilder &builder, mlir::Location loc,
+                             mlir::Value resultAddr, mlir::Value array,
+                             mlir::Value kind) {
+  mlir::func::FuncOp func =
+      fir::runtime::getRuntimeFunc<mkRTKey(Lbound)>(loc, builder);
+  auto fTy = func.getFunctionType();
+  auto sourceFile = fir::factory::locationToFilename(builder, loc);
+  auto sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
+  auto args = fir::runtime::createArguments(
+      builder, loc, fTy, resultAddr, array, kind, sourceFile, sourceLine);
+  builder.create<fir::CallOp>(loc, func, args).getResult(0);
+}
+
 /// Generate call to `Ubound` runtime routine.  Calls to UBOUND with a DIM
 /// argument get transformed into an expression equivalent to
 /// SIZE() + LBOUND() - 1, so they don't have an intrinsic in the runtime.
diff --git a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
index 0a280816ffcc83..4b086a98de47b2 100644
--- a/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
+++ b/flang/lib/Optimizer/Builder/Runtime/Reduction.cpp
@@ -505,6 +505,50 @@ struct ForcedReduceReal16 {
   }
 };
 
+/// Placeholder for DIM real*10 version of Reduce Intrinsic
+struct ForcedReduceReal10Dim {
+  static constexpr const char *name =
+      ExpandAndQuoteKey(RTNAME(ReduceReal10Dim));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF80(ctx);
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      auto refTy = fir::ReferenceType::get(ty);
+      auto refBoxTy = fir::ReferenceType::get(boxTy);
+      auto i1Ty = mlir::IntegerType::get(ctx, 1);
+      return mlir::FunctionType::get(
+          ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty},
+          {});
+    };
+  }
+};
+
+/// Placeholder for DIM real*16 version of Reduce Intrinsic
+struct ForcedReduceReal16Dim {
+  static constexpr const char *name =
+      ExpandAndQuoteKey(RTNAME(ReduceReal16Dim));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::FloatType::getF128(ctx);
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      auto refTy = fir::ReferenceType::get(ty);
+      auto refBoxTy = fir::ReferenceType::get(boxTy);
+      auto i1Ty = mlir::IntegerType::get(ctx, 1);
+      return mlir::FunctionType::get(
+          ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty},
+          {});
+    };
+  }
+};
+
 /// Placeholder for integer*16 version of Reduce Intrinsic
 struct ForcedReduceInteger16 {
   static constexpr const char *name =
@@ -525,6 +569,28 @@ struct ForcedReduceInteger16 {
   }
 };
 
+/// Placeholder for DIM integer*16 version of Reduce Intrinsic
+struct ForcedReduceInteger16Dim {
+  static constexpr const char *name =
+      ExpandAndQuoteKey(RTNAME(ReduceInteger16Dim));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::IntegerType::get(ctx, 128);
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      auto refTy = fir::ReferenceType::get(ty);
+      auto refBoxTy = fir::ReferenceType::get(boxTy);
+      auto i1Ty = mlir::IntegerType::get(ctx, 1);
+      return mlir::FunctionType::get(
+          ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty},
+          {});
+    };
+  }
+};
+
 /// Placeholder for complex(10) version of Reduce Intrinsic
 struct ForcedReduceComplex10 {
   static constexpr const char *name =
@@ -546,6 +612,28 @@ struct ForcedReduceComplex10 {
   }
 };
 
+/// Placeholder for Dim complex(10) version of Reduce Intrinsic
+struct ForcedReduceComplex10Dim {
+  static constexpr const char *name =
+      ExpandAndQuoteKey(RTNAME(CppReduceComplex10Dim));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::ComplexType::get(mlir::FloatType::getF80(ctx));
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      auto refTy = fir::ReferenceType::get(ty);
+      auto refBoxTy = fir::ReferenceType::get(boxTy);
+      auto i1Ty = mlir::IntegerType::get(ctx, 1);
+      return mlir::FunctionType::get(
+          ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty},
+          {});
+    };
+  }
+};
+
 /// Placeholder for complex(16) version of Reduce Intrinsic
 struct ForcedReduceComplex16 {
   static constexpr const char *name =
@@ -567,6 +655,28 @@ struct ForcedReduceComplex16 {
   }
 };
 
+/// Placeholder for Dim complex(16) version of Reduce Intrinsic
+struct ForcedReduceComplex16Dim {
+  static constexpr const char *name =
+      ExpandAndQuoteKey(RTNAME(CppReduceComplex16Dim));
+  static constexpr fir::runtime::FuncTypeBuilderFunc getTypeModel() {
+    return [](mlir::MLIRContext *ctx) {
+      auto ty = mlir::ComplexType::get(mlir::FloatType::getF128(ctx));
+      auto boxTy =
+          fir::runtime::getModel<const Fortran::runtime::Descriptor &>()(ctx);
+      auto opTy = mlir::FunctionType::get(ctx, {ty, ty}, ty);
+      auto strTy = fir::ReferenceType::get(mlir::IntegerType::get(ctx, 8));
+      auto intTy = mlir::IntegerType::get(ctx, 8 * sizeof(int));
+      auto refTy = fir::ReferenceType::get(ty);
+      auto refBoxTy = fir::ReferenceType::get(boxTy);
+      auto i1Ty = mlir::IntegerType::get(ctx, 1);
+      return mlir::FunctionType::get(
+          ctx, {refBoxTy, boxTy, opTy, strTy, intTy, intTy, boxTy, refTy, i1Ty},
+          {});
+    };
+  }
+};
+
 /// Generate call to specialized runtime function that takes a mask and
 /// dim argument. The All, Any, and Count intrinsics use this pattern.
 template <typename FN>
@@ -1461,3 +1571,97 @@ mlir::Value fir::runtime::genReduce(fir::FirOpBuilder &builder,
                                             maskBox, identity, ordered);
   return builder.create<fir::CallOp>(loc, func, args).getResult(0);
 }
+
+void fir::runtime::genReduceDim(fir::FirOpBuilder &builder, mlir::Location loc,
+                                mlir::Value arrayBox, mlir::Value operation,
+                                mlir::Value dim, mlir::Value maskBox,
+                                mlir::Value identity, mlir::Value ordered,
+                                mlir::Value resultBox) {
+  mlir::func::FuncOp func;
+  auto ty = arrayBox.getType();
+  auto arrTy = fir::dyn_cast_ptrOrBoxEleTy(ty);
+  auto eleTy = mlir::cast<fir::SequenceType>(arrTy).getEleTy();
+
+  mlir::MLIRContext *ctx = builder.getContext();
+  fir::factory::CharacterExprHelper charHelper{builder, loc};
+
+  if (eleTy.isF16())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceReal2Dim)>(loc, builder);
+  else if (eleTy.isBF16())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceReal3Dim)>(loc, builder);
+  else if (eleTy.isF32())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceReal4Dim)>(loc, builder);
+  else if (eleTy.isF64())
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceReal8Dim)>(loc, builder);
+  else if (eleTy.isF80())
+    func = fir::runtime::getRuntimeFunc<ForcedReduceReal10Dim>(loc, builder);
+  else if (eleTy.isF128())
+    func = fir::runtime::getRuntimeFunc<ForcedReduceReal16Dim>(loc, builder);
+  else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(1)))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceInteger1Dim)>(loc, builder);
+  else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(2)))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceInteger2Dim)>(loc, builder);
+  else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(4)))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceInteger4Dim)>(loc, builder);
+  else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(8)))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceInteger8Dim)>(loc, builder);
+  else if (eleTy.isInteger(builder.getKindMap().getIntegerBitsize(16)))
+    func = fir::runtime::getRuntimeFunc<ForcedReduceInteger16Dim>(loc, builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 2))
+    func = fir::runtime::getRuntimeFunc<mkRTKey(CppReduceComplex2Dim)>(loc,
+                                                                       builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 3))
+    func = fir::runtime::getRuntimeFunc<mkRTKey(CppReduceComplex3Dim)>(loc,
+                                                                       builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 4))
+    func = fir::runtime::getRuntimeFunc<mkRTKey(CppReduceComplex4Dim)>(loc,
+                                                                       builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 8))
+    func = fir::runtime::getRuntimeFunc<mkRTKey(CppReduceComplex8Dim)>(loc,
+                                                                       builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 10))
+    func = fir::runtime::getRuntimeFunc<ForcedReduceComplex10Dim>(loc, builder);
+  else if (eleTy == fir::ComplexType::get(ctx, 16))
+    func = fir::runtime::getRuntimeFunc<ForcedReduceComplex16Dim>(loc, builder);
+  else if (eleTy == fir::LogicalType::get(ctx, 1))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceLogical1Dim)>(loc, builder);
+  else if (eleTy == fir::LogicalType::get(ctx, 2))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceLogical2Dim)>(loc, builder);
+  else if (eleTy == fir::LogicalType::get(ctx, 4))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceLogical4Dim)>(loc, builder);
+  else if (eleTy == fir::LogicalType::get(ctx, 8))
+    func =
+        fir::runtime::getRuntimeFunc<mkRTKey(ReduceLogical8Dim)>(loc, builder);
+  else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 1)
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceCharacter1Dim)>(loc,
+                                                                      builder);
+  else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 2)
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceCharacter2Dim)>(loc,
+                                                                      builder);
+  else if (fir::isa_char(eleTy) && charHelper.getCharacterKind(eleTy) == 4)
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceCharacter4Dim)>(loc,
+                                                                      builder);
+  else if (fir::isa_derived(eleTy))
+    func = fir::runtime::getRuntimeFunc<mkRTKey(ReduceDerivedTypeDim)>(loc,
+                                                                       builder);
+  else
+    fir::intrinsicTypeTODO(builder, eleTy, loc, "REDUCE");
+
+  auto fTy = func.getFunctionType();
+  auto sourceFile = fir::factory::locationToFilename(builder, loc);
+
+  auto sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(4));
+  auto opAddr = builder.create<fir::BoxAddrOp>(loc, fTy.getInput(2), operation);
+  auto args = fir::runtime::createArguments(
+      builder, loc, fTy, resultBox, arrayBox, opAddr, sourceFile, sourceLine,
+      dim, maskBox, identity, ordered);
+  builder.create<fir::CallOp>(loc, func, args);
+}
diff --git a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
index 07d3bd713ce45d..501a36f5b68ba6 100644
--- a/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
+++ b/flang/lib/Optimizer/CodeGen/TypeConverter.cpp
@@ -13,9 +13,9 @@
 #define DEBUG_TYPE "flang-type-conversion"
 
 #include "flang/Optimizer/CodeGen/TypeConverter.h"
-#include "DescriptorModel.h"
 #include "flang/Common/Fortran.h"
 #include "flang/Optimizer/Builder/Todo.h" // remove when TODO's are done
+#include "flang/Optimizer/CodeGen/DescriptorModel.h"
 #include "flang/Optimizer/CodeGen/TBAABuilder.h"
 #include "flang/Optimizer/CodeGen/Target.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index a174f2c2bc4bfd..53745d10fe9e4d 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -13,15 +13,55 @@
 #define DEBUG_TYPE "flang-debug-type-generator"
 
 #include "DebugTypeGenerator.h"
+#include "flang/Optimizer/CodeGen/DescriptorModel.h"
+#include "flang/Optimizer/CodeGen/TypeConverter.h"
+#include "flang/Optimizer/Support/DataLayout.h"
+#include "mlir/Pass/Pass.h"
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/BinaryFormat/Dwarf.h"
 #include "llvm/Support/Debug.h"
 
 namespace fir {
 
+/// Calculate offset of any field in the descriptor.
+template <int DescriptorField>
+std::uint64_t getComponentOffset(const mlir::DataLayout &dl,
+                                 mlir::MLIRContext *context,
+                                 mlir::Type llvmFieldType) {
+  static_assert(DescriptorField > 0 && DescriptorField < 10);
+  mlir::Type previousFieldType =
+      getDescFieldTypeModel<DescriptorField - 1>()(context);
+  std::uint64_t previousOffset =
+      getComponentOffset<DescriptorField - 1>(dl, context, previousFieldType);
+  std::uint64_t offset = previousOffset + dl.getTypeSize(previousFieldType);
+  std::uint64_t fieldAlignment = dl.getTypeABIAlignment(llvmFieldType);
+  return llvm::alignTo(offset, fieldAlignment);
+}
+template <>
+std::uint64_t getComponentOffset<0>(const mlir::DataLayout &dl,
+                                    mlir::MLIRContext *context,
+                                    mlir::Type llvmFieldType) {
+  return 0;
+}
+
 DebugTypeGenerator::DebugTypeGenerator(mlir::ModuleOp m)
     : module(m), kindMapping(getKindMapping(m)) {
   LLVM_DEBUG(llvm::dbgs() << "DITypeAttr generator\n");
+
+  std::optional<mlir::DataLayout> dl =
+      fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/true);
+  if (!dl) {
+    mlir::emitError(module.getLoc(), "Missing data layout attribute in module");
+    return;
+  }
+
+  mlir::MLIRContext *context = module.getContext();
+
+  // The debug information requires the offset of certain fields in the
+  // descriptors like lower_bound and extent for each dimension.
+  mlir::Type llvmDimsType = getDescFieldTypeModel<kDimsPosInBox>()(context);
+  dimsOffset = getComponentOffset<kDimsPosInBox>(*dl, context, llvmDimsType);
+  dimsSize = dl->getTypeSize(llvmDimsType);
 }
 
 static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context,
@@ -37,10 +77,82 @@ static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
                       llvm::dwarf::DW_ATE_signed);
 }
 
+mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType(
+    fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr,
+    mlir::LLVM::DIScopeAttr scope, mlir::Location loc, bool genAllocated,
+    bool genAssociated) {
+
+  mlir::MLIRContext *context = module.getContext();
+  // FIXME: Assumed rank arrays not supported yet
+  if (seqTy.hasUnknownShape())
+    return genPlaceholderType(context);
+
+  llvm::SmallVector<mlir::LLVM::DIExpressionElemAttr> ops;
+  auto addOp = [&](unsigned opc, llvm::ArrayRef<uint64_t> vals) {
+    ops.push_back(mlir::LLVM::DIExpressionElemAttr::get(context, opc, vals));
+  };
+
+  addOp(llvm::dwarf::DW_OP_push_object_address, {});
+  addOp(llvm::dwarf::DW_OP_deref, {});
+
+  // dataLocation = *base_addr
+  mlir::LLVM::DIExpressionAttr dataLocation =
+      mlir::LLVM::DIExpressionAttr::get(context, ops);
+  addOp(llvm::dwarf::DW_OP_lit0, {});
+  addOp(llvm::dwarf::DW_OP_ne, {});
+
+  // allocated = associated = (*base_addr != 0)
+  mlir::LLVM::DIExpressionAttr valid =
+      mlir::LLVM::DIExpressionAttr::get(context, ops);
+  mlir::LLVM::DIExpressionAttr associated = genAllocated ? valid : nullptr;
+  mlir::LLVM::DIExpressionAttr allocated = genAssociated ? valid : nullptr;
+  ops.clear();
+
+  llvm::SmallVector<mlir::LLVM::DINodeAttr> elements;
+  mlir::LLVM::DITypeAttr elemTy =
+      convertType(seqTy.getEleTy(), fileAttr, scope, loc);
+  unsigned offset = dimsOffset;
+  const unsigned indexSize = dimsSize / 3;
+  for ([[maybe_unused]] auto _ : seqTy.getShape()) {
+    // For each dimension, find the offset of count and lower bound in the
+    // descriptor and generate the dwarf expression to extract it.
+    // FIXME: If `indexSize` happens to be bigger than address size on the
+    // system then we may have to change 'DW_OP_deref' here.
+    addOp(llvm::dwarf::DW_OP_push_object_address, {});
+    addOp(llvm::dwarf::DW_OP_plus_uconst,
+          {offset + (indexSize * kDimExtentPos)});
+    addOp(llvm::dwarf::DW_OP_deref, {});
+    // count[i] = *(base_addr + offset + (indexSize * kDimExtentPos))
+    // where 'offset' is dimsOffset + (i * dimsSize)
+    mlir::LLVM::DIExpressionAttr countAttr =
+        mlir::LLVM::DIExpressionAttr::get(context, ops);
+    ops.clear();
+
+    addOp(llvm::dwarf::DW_OP_push_object_address, {});
+    addOp(llvm::dwarf::DW_OP_plus_uconst,
+          {offset + (indexSize * kDimLowerBoundPos)});
+    addOp(llvm::dwarf::DW_OP_deref, {});
+    // lower_bound[i] = *(base_addr + offset + (indexSize * kDimLowerBoundPos))
+    mlir::LLVM::DIExpressionAttr lowerAttr =
+        mlir::LLVM::DIExpressionAttr::get(context, ops);
+    ops.clear();
+
+    offset += dimsSize;
+    mlir::LLVM::DISubrangeAttr subrangeTy = mlir::LLVM::DISubrangeAttr::get(
+        context, nullptr, lowerAttr, countAttr, nullptr);
+    elements.push_back(subrangeTy);
+  }
+  return mlir::LLVM::DICompositeTypeAttr::get(
+      context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {},
+      /* name */ nullptr, /* file */ nullptr, /* line */ 0,
+      /* scope */ nullptr, elemTy, mlir::LLVM::DIFlags::Zero,
+      /* sizeInBits */ 0, /*alignInBits*/ 0, elements, dataLocation,
+      /* rank */ nullptr, allocated, associated);
+}
+
 mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType(
     fir::SequenceType seqTy, mlir::LLVM::DIFileAttr fileAttr,
     mlir::LLVM::DIScopeAttr scope, mlir::Location loc) {
-
   mlir::MLIRContext *context = module.getContext();
   // FIXME: Only fixed sizes arrays handled at the moment.
   if (seqTy.hasDynamicExtents())
@@ -112,6 +224,12 @@ DebugTypeGenerator::convertType(mlir::Type Ty, mlir::LLVM::DIFileAttr fileAttr,
                         bitWidth * 2, llvm::dwarf::DW_ATE_complex_float);
   } else if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(Ty)) {
     return convertSequenceType(seqTy, fileAttr, scope, loc);
+  } else if (auto boxTy = mlir::dyn_cast_or_null<fir::BoxType>(Ty)) {
+    auto elTy = boxTy.getElementType();
+    if (auto seqTy = mlir::dyn_cast_or_null<fir::SequenceType>(elTy))
+      return convertBoxedSequenceType(seqTy, fileAttr, scope, loc, false,
+                                      false);
+    return genPlaceholderType(context);
   } else {
     // FIXME: These types are currently unhandled. We are generating a
     // placeholder type to allow us to test supported bits.
diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
index 963c919d66825c..11515d11dfed63 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.h
@@ -35,8 +35,20 @@ class DebugTypeGenerator {
                                              mlir::LLVM::DIFileAttr fileAttr,
                                              mlir::LLVM::DIScopeAttr scope,
                                              mlir::Location loc);
+
+  /// The 'genAllocated' is true when we want to generate 'allocated' field
+  /// in the DICompositeType. It is needed for the allocatable arrays.
+  /// Similarly, 'genAssociated' is used with 'pointer' type to generate
+  /// 'associated' field.
+  mlir::LLVM::DITypeAttr
+  convertBoxedSequenceType(fir::SequenceType seqTy,
+                           mlir::LLVM::DIFileAttr fileAttr,
+                           mlir::LLVM::DIScopeAttr scope, mlir::Location loc,
+                           bool genAllocated, bool genAssociated);
   mlir::ModuleOp module;
   KindMapping kindMapping;
+  std::uint64_t dimsSize;
+  std::uint64_t dimsOffset;
 };
 
 } // namespace fir
diff --git a/flang/test/Integration/debug-assumed-shape-array.f90 b/flang/test/Integration/debug-assumed-shape-array.f90
new file mode 100644
index 00000000000000..7b0801c12dba11
--- /dev/null
+++ b/flang/test/Integration/debug-assumed-shape-array.f90
@@ -0,0 +1,13 @@
+! RUN: %flang_fc1 -emit-llvm -debug-info-kind=standalone %s -o - | FileCheck  %s
+
+subroutine ff(arr)
+  implicit none
+    integer :: arr(:, :)
+    return arr(1,1)
+end subroutine ff
+
+! CHECK-DAG: !DICompositeType(tag: DW_TAG_array_type{{.*}}elements: ![[ELEMS:[0-9]+]], dataLocation: !DIExpression(DW_OP_push_object_address, DW_OP_deref))
+! CHECK-DAG: ![[ELEMS]] = !{![[ELEM1:[0-9]+]], ![[ELEM2:[0-9]+]]}
+! CHECK-DAG: ![[ELEM1]] = !DISubrange(lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 24, DW_OP_deref), upperBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 32, DW_OP_deref))
+! CHECK-DAG: ![[ELEM2]] = !DISubrange(lowerBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 48, DW_OP_deref), upperBound: !DIExpression(DW_OP_push_object_address, DW_OP_plus_uconst, 56, DW_OP_deref))
+
diff --git a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90 b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90
index bbeff5ff051915..e568b94f4f8843 100644
--- a/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90
+++ b/flang/test/Lower/HLFIR/assumed-rank-inquiries-3.f90
@@ -54,3 +54,58 @@ subroutine test_shape_2(x)
 ! CHECK:           %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> index
 ! CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
 ! CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_14]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+
+
+subroutine test_lbound(x)
+  real :: x(..)
+  call takes_integer_array(lbound(x))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_lbound(
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 4 : i32
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK:           %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}})
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK:           %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box<!fir.array<*:f32>>) -> index
+! CHECK:           %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
+! CHECK:           %[[VAL_15:.*]] = arith.constant false
+! CHECK:           %[[VAL_16:.*]] = hlfir.as_expr %[[VAL_14]]#0 move %[[VAL_15]] : (!fir.box<!fir.array<?xi32>>, i1) -> !hlfir.expr<?xi32>
+! CHECK:           %[[VAL_17:.*]]:3 = hlfir.associate %[[VAL_16]](%[[VAL_13]]) {adapt.valuebyref} : (!hlfir.expr<?xi32>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>, i1)
+! CHECK:           fir.call @_QPtakes_integer_array(%[[VAL_17]]#1) fastmath<contract> : (!fir.ref<!fir.array<?xi32>>) -> ()
+! CHECK:           hlfir.end_associate %[[VAL_17]]#1, %[[VAL_17]]#2 : !fir.ref<!fir.array<?xi32>>, i1
+! CHECK:           hlfir.destroy %[[VAL_16]] : !hlfir.expr<?xi32>
+! CHECK:           return
+! CHECK:         }
+
+subroutine test_lbound_kind(x)
+  real :: x(..)
+  call takes_integer8_array(lbound(x, kind=8))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_lbound_kind(
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<15xi64>
+! CHECK:           %[[VAL_4:.*]] = arith.constant 8 : i32
+! CHECK:           %[[VAL_7:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi64>>) -> !fir.llvm_ptr<i8>
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_3:.*]] : (!fir.box<!fir.array<*:f32>>) -> !fir.box<none>
+! CHECK:           %[[VAL_10:.*]] = fir.call @_FortranALbound(%[[VAL_7]], %[[VAL_8]], %[[VAL_4]], %{{.*}}, %{{.*}})
+! CHECK:           %[[VAL_11:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi64>>) -> !fir.ref<!fir.array<?xi64>>
+! CHECK:           %[[VAL_12:.*]] = fir.box_rank %[[VAL_3]] : (!fir.box<!fir.array<*:f32>>) -> index
+! CHECK:           %[[VAL_13:.*]] = fir.shape %[[VAL_12]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_14:.*]]:2 = hlfir.declare %[[VAL_11]](%[[VAL_13]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi64>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi64>>, !fir.ref<!fir.array<?xi64>>)
+
+subroutine test_lbound_2(x)
+  real, pointer :: x(..)
+  call takes_integer_array(lbound(x))
+end subroutine
+! CHECK-LABEL:   func.func @_QPtest_lbound_2(
+! CHECK:           %[[VAL_1:.*]] = fir.alloca !fir.array<15xi32>
+! CHECK:           %[[VAL_4:.*]] = fir.load %[[VAL_3:.*]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<*:f32>>>>
+! CHECK:           %[[VAL_5:.*]] = arith.constant 4 : i32
+! CHECK:           %[[VAL_8:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.llvm_ptr<i8>
+! CHECK:           %[[VAL_9:.*]] = fir.convert %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> !fir.box<none>
+! CHECK:           %[[VAL_11:.*]] = fir.call @_FortranALbound(%[[VAL_8]], %[[VAL_9]], %[[VAL_5]], %{{.*}}, %{{.*}})
+! CHECK:           %[[VAL_12:.*]] = fir.convert %[[VAL_1]] : (!fir.ref<!fir.array<15xi32>>) -> !fir.ref<!fir.array<?xi32>>
+! CHECK:           %[[VAL_13:.*]] = fir.box_rank %[[VAL_4]] : (!fir.box<!fir.ptr<!fir.array<*:f32>>>) -> index
+! CHECK:           %[[VAL_14:.*]] = fir.shape %[[VAL_13]] : (index) -> !fir.shape<1>
+! CHECK:           %[[VAL_15:.*]]:2 = hlfir.declare %[[VAL_12]](%[[VAL_14]]) {uniq_name = ".tmp.intrinsic_result"} : (!fir.ref<!fir.array<?xi32>>, !fir.shape<1>) -> (!fir.box<!fir.array<?xi32>>, !fir.ref<!fir.array<?xi32>>)
diff --git a/flang/test/Lower/Intrinsics/reduce.f90 b/flang/test/Lower/Intrinsics/reduce.f90
index 36900abaa79f8c..842e626d7cc397 100644
--- a/flang/test/Lower/Intrinsics/reduce.f90
+++ b/flang/test/Lower/Intrinsics/reduce.f90
@@ -392,4 +392,225 @@ subroutine testtype(a)
 
 ! CHECK: fir.call @_FortranAReduceDerivedType
 
+subroutine integer1dim(a, id)
+  integer(1), intent(in) :: a(:,:)
+  integer(1), allocatable :: res(:)
+
+  res = reduce(a, red_int1, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceInteger1Dim
+
+subroutine integer2dim(a, id)
+  integer(2), intent(in) :: a(:,:)
+  integer(2), allocatable :: res(:)
+
+  res = reduce(a, red_int2, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceInteger2Dim
+
+subroutine integer4dim(a, id)
+  integer(4), intent(in) :: a(:,:)
+  integer(4), allocatable :: res(:)
+
+  res = reduce(a, red_int4, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceInteger4Dim
+
+subroutine integer8dim(a, id)
+  integer(8), intent(in) :: a(:,:)
+  integer(8), allocatable :: res(:)
+
+  res = reduce(a, red_int8, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceInteger8Dim
+
+subroutine integer16dim(a, id)
+  integer(16), intent(in) :: a(:,:)
+  integer(16), allocatable :: res(:)
+
+  res = reduce(a, red_int16, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceInteger16Dim
+
+subroutine real2dim(a, id)
+  real(2), intent(in) :: a(:,:)
+  real(2), allocatable :: res(:)
+
+  res = reduce(a, red_real2, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal2Dim
+
+subroutine real3dim(a, id)
+  real(3), intent(in) :: a(:,:)
+  real(3), allocatable :: res(:)
+
+  res = reduce(a, red_real3, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal3Dim
+
+subroutine real4dim(a, id)
+  real(4), intent(in) :: a(:,:)
+  real(4), allocatable :: res(:)
+
+  res = reduce(a, red_real4, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal4Dim
+
+subroutine real8dim(a, id)
+  real(8), intent(in) :: a(:,:)
+  real(8), allocatable :: res(:)
+
+  res = reduce(a, red_real8, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal8Dim
+
+subroutine real10dim(a, id)
+  real(10), intent(in) :: a(:,:)
+  real(10), allocatable :: res(:)
+
+  res = reduce(a, red_real10, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal10Dim
+
+subroutine real16dim(a, id)
+  real(16), intent(in) :: a(:,:)
+  real(16), allocatable :: res(:)
+
+  res = reduce(a, red_real16, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceReal16Dim
+
+subroutine complex2dim(a, id)
+  complex(2), intent(in) :: a(:,:)
+  complex(2), allocatable :: res(:)
+
+  res = reduce(a, red_complex2, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex2Dim
+
+subroutine complex3dim(a, id)
+  complex(3), intent(in) :: a(:,:)
+  complex(3), allocatable :: res(:)
+
+  res = reduce(a, red_complex3, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex3Dim
+
+subroutine complex4dim(a, id)
+  complex(4), intent(in) :: a(:,:)
+  complex(4), allocatable :: res(:)
+
+  res = reduce(a, red_complex4, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex4Dim
+
+subroutine complex8dim(a, id)
+  complex(8), intent(in) :: a(:,:)
+  complex(8), allocatable :: res(:)
+
+  res = reduce(a, red_complex8, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex8Dim
+
+subroutine complex10dim(a, id)
+  complex(10), intent(in) :: a(:,:)
+  complex(10), allocatable :: res(:)
+
+  res = reduce(a, red_complex10, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex10Dim
+
+subroutine complex16dim(a, id)
+  complex(16), intent(in) :: a(:,:)
+  complex(16), allocatable :: res(:)
+
+  res = reduce(a, red_complex16, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranACppReduceComplex16Dim
+
+subroutine logical1dim(a, id)
+  logical(1), intent(in) :: a(:,:)
+  logical(1), allocatable :: res(:)
+
+  res = reduce(a, red_log1, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceLogical1Dim
+
+subroutine logical2dim(a, id)
+  logical(2), intent(in) :: a(:,:)
+  logical(2), allocatable :: res(:)
+
+  res = reduce(a, red_log2, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceLogical2Dim
+
+subroutine logical4dim(a, id)
+  logical(4), intent(in) :: a(:,:)
+  logical(4), allocatable :: res(:)
+
+  res = reduce(a, red_log4, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceLogical4Dim
+
+subroutine logical8dim(a, id)
+  logical(8), intent(in) :: a(:,:)
+  logical(8), allocatable :: res(:)
+
+  res = reduce(a, red_log8, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceLogical8Dim
+
+subroutine testtypeDim(a)
+  type(t1), intent(in) :: a(:,:)
+  type(t1), allocatable :: res(:)
+  res = reduce(a, red_type, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceDerivedTypeDim
+
+subroutine char1dim(a)
+  character(1), intent(in) :: a(:, :)
+  character(1), allocatable :: res(:)
+  res = reduce(a, red_char1, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceCharacter1Dim
+
+subroutine char2dim(a)
+  character(kind=2), intent(in) :: a(:, :)
+  character(kind=2), allocatable :: res(:)
+  res = reduce(a, red_char2, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceCharacter2Dim
+
+subroutine char4dim(a)
+  character(kind=4), intent(in) :: a(:, :)
+  character(kind=4), allocatable :: res(:)
+  res = reduce(a, red_char4, 2)
+end subroutine
+
+! CHECK: fir.call @_FortranAReduceCharacter4Dim
+
 end module
diff --git a/flang/test/Transforms/debug-90683.fir b/flang/test/Transforms/debug-90683.fir
index 9da0e5347d3f8f..cc6929c10411f8 100644
--- a/flang/test/Transforms/debug-90683.fir
+++ b/flang/test/Transforms/debug-90683.fir
@@ -2,7 +2,7 @@
 
 // This test checks that debug information for fir.real type works ok.
 
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QPfn1(%arg0: !fir.ref<!fir.complex<8>> {fir.bindc_name = "a"} ) {
     %0 = fir.declare %arg0 {uniq_name = "_QFfn1Ea"} : (!fir.ref<!fir.complex<8>>) -> !fir.ref<!fir.complex<8>>
     %1 = fir.alloca f32 {bindc_name = "abserror", uniq_name = "_QFfn1Eabserror"}
diff --git a/flang/test/Transforms/debug-assumed-shape-array.fir b/flang/test/Transforms/debug-assumed-shape-array.fir
new file mode 100644
index 00000000000000..00dec9b318c811
--- /dev/null
+++ b/flang/test/Transforms/debug-assumed-shape-array.fir
@@ -0,0 +1,16 @@
+// RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>, #dlti.dl_entry<"dlti.endianness", "little">>, fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"} {
+  func.func @ff_(%arg0: !fir.box<!fir.array<?x?xi32>> {fir.bindc_name = "arr"} ) {
+    %0 = fir.undefined !fir.dscope
+    %1 = fircg.ext_declare %arg0 dummy_scope %0 {uniq_name = "_QFffEarr"} : (!fir.box<!fir.array<?x?xi32>>, !fir.dscope) -> !fir.box<!fir.array<?x?xi32>> loc(#loc1)
+    return
+  } loc(#loc2)
+}
+#loc1 = loc("test1.f90":1:1)
+#loc2 = loc("test1.f90":3:16)
+
+// CHECK: #llvm.di_composite_type<tag = DW_TAG_array_type
+// CHECK-SAME: elements = #llvm.di_subrange<lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(24), DW_OP_deref]>, upperBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(32), DW_OP_deref]>>
+// CHECK-SAME: #llvm.di_subrange<lowerBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(48), DW_OP_deref]>, upperBound = #llvm.di_expression<[DW_OP_push_object_address, DW_OP_plus_uconst(56), DW_OP_deref]>>
+// CHECK-SAME: dataLocation = <[DW_OP_push_object_address, DW_OP_deref]>>
diff --git a/flang/test/Transforms/debug-complex-1.fir b/flang/test/Transforms/debug-complex-1.fir
index a3cbd767d8a58e..cc742d3b183bbf 100644
--- a/flang/test/Transforms/debug-complex-1.fir
+++ b/flang/test/Transforms/debug-complex-1.fir
@@ -3,7 +3,7 @@
 // check conversion of complex type of different size. Both fir and mlir
 // variants are checked.
 
-module attributes {fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.target_triple = "native"} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @test1(%x : !fir.complex<4>) -> !fir.complex<8> {
   %1 = fir.convert %x : (!fir.complex<4>) -> !fir.complex<8>
   return %1 : !fir.complex<8>
diff --git a/flang/test/Transforms/debug-fixed-array-type.fir b/flang/test/Transforms/debug-fixed-array-type.fir
index 401c725411831e..d4ed0b97020898 100644
--- a/flang/test/Transforms/debug-fixed-array-type.fir
+++ b/flang/test/Transforms/debug-fixed-array-type.fir
@@ -1,6 +1,6 @@
 // RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
 
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QQmain() attributes {fir.bindc_name = "mn"} {
     %c7 = arith.constant 7 : index
     %c8 = arith.constant 8 : index
diff --git a/flang/test/Transforms/debug-line-table-existing.fir b/flang/test/Transforms/debug-line-table-existing.fir
index 534278ebc972d3..0e006303c8a81d 100644
--- a/flang/test/Transforms/debug-line-table-existing.fir
+++ b/flang/test/Transforms/debug-line-table-existing.fir
@@ -3,7 +3,7 @@
 // REQUIRES: system-linux
 
 // Test that there are no changes to a function with existed fused loc debug
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QPs1() {
     return loc(#loc1)
   } loc(#loc2)
diff --git a/flang/test/Transforms/debug-line-table-inc-file.fir b/flang/test/Transforms/debug-line-table-inc-file.fir
index 9370c138fd42ff..065039b59c5ae8 100644
--- a/flang/test/Transforms/debug-line-table-inc-file.fir
+++ b/flang/test/Transforms/debug-line-table-inc-file.fir
@@ -3,7 +3,7 @@
 // REQUIRES: system-linux
 
 // Test for included functions that have a different debug location than the current file
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QPsinc() {
     return loc(#loc2)
   } loc(#loc1)
@@ -19,7 +19,7 @@ module attributes {} {
 #loc4 = loc("/home/user01/llvm-project/build_release/simple.f90":4:3)
 #loc5 = loc("/home/user01/llvm-project/build_release/simple.f90":5:1)
 
-// CHECK: module {
+// CHECK: module
 // CHECK:   func.func @_QPsinc() {
 // CHECK:   } loc(#[[FUSED_LOC_INC_FILE:.*]])
 // CHECK:   func.func @_QQmain() {
diff --git a/flang/test/Transforms/debug-line-table-inc-same-file.fir b/flang/test/Transforms/debug-line-table-inc-same-file.fir
index 4836f2e21dd9db..bcaf4497982310 100644
--- a/flang/test/Transforms/debug-line-table-inc-same-file.fir
+++ b/flang/test/Transforms/debug-line-table-inc-same-file.fir
@@ -4,7 +4,7 @@
 
 // Test that there is only one FileAttribute generated for multiple functions
 // in the same file.
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QPs1() {
     return loc(#loc2)
   } loc(#loc1)
diff --git a/flang/test/Transforms/debug-line-table.fir b/flang/test/Transforms/debug-line-table.fir
index 8a72ca2a856a70..d6e54fd1ac467e 100644
--- a/flang/test/Transforms/debug-line-table.fir
+++ b/flang/test/Transforms/debug-line-table.fir
@@ -3,7 +3,7 @@
 // RUN: fir-opt --add-debug-info="debug-level=LineTablesOnly" --mlir-print-debuginfo %s | FileCheck %s --check-prefix=LINETABLE
 // RUN: fir-opt --add-debug-info="is-optimized=true" --mlir-print-debuginfo %s | FileCheck %s --check-prefix=OPT
 
-module attributes { fir.defaultkind = "a1c4d8i4l4r4", fir.kindmap = "", llvm.data_layout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", llvm.target_triple = "aarch64-unknown-linux-gnu"} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   func.func @_QPsb() {
     return loc(#loc_sb)
   } loc(#loc_sb)
diff --git a/flang/test/Transforms/debug-local-var-2.f90 b/flang/test/Transforms/debug-local-var-2.f90
index ee60a07cc4bee6..0fe1b81c27e61e 100644
--- a/flang/test/Transforms/debug-local-var-2.f90
+++ b/flang/test/Transforms/debug-local-var-2.f90
@@ -20,20 +20,20 @@
 
 ! CHECK-LABEL: define {{.*}}i64 @_QFPfn1
 ! CHECK-SAME: (ptr %[[ARG1:.*]], ptr %[[ARG2:.*]], ptr %[[ARG3:.*]])
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression())
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression())
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG1]], metadata ![[A1:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG2]], metadata ![[B1:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[ARG3]], metadata ![[C1:.*]], metadata !DIExpression())
 ! CHECK-DAG: %[[AL2:.*]] = alloca i64
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL2]], metadata ![[RES1:.*]], metadata !DIExpression())
 ! CHECK-LABEL: }
 
 ! CHECK-LABEL: define {{.*}}i32 @_QFPfn2
 ! CHECK-SAME: (ptr %[[FN2ARG1:.*]], ptr %[[FN2ARG2:.*]], ptr %[[FN2ARG3:.*]])
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression())
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression())
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG1]], metadata ![[A2:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG2]], metadata ![[B2:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[FN2ARG3]], metadata ![[C2:.*]], metadata !DIExpression())
 ! CHECK-DAG: %[[AL3:.*]] = alloca i32
-! CHECK-DAG: call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression())
+! CHECK-DAG: tail call void @llvm.dbg.declare(metadata ptr %[[AL3]], metadata ![[RES2:.*]], metadata !DIExpression())
 ! CHECK-LABEL: }
 
 program mn
diff --git a/flang/test/Transforms/debug-module-1.fir b/flang/test/Transforms/debug-module-1.fir
index 822ae01b99aa78..71457d32b15960 100644
--- a/flang/test/Transforms/debug-module-1.fir
+++ b/flang/test/Transforms/debug-module-1.fir
@@ -1,7 +1,7 @@
 // RUN: fir-opt --add-debug-info --mlir-print-debuginfo %s | FileCheck %s
 
 
-module attributes {} {
+module attributes {dlti.dl_spec = #dlti.dl_spec<>} {
   fir.global @_QMhelperEgli : i32 {
     %0 = fir.zero_bits i32
     fir.has_value %0 : i32
diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index afb2d6d91cba43..d4aa50a43d186d 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -392,6 +392,16 @@ else()
       libc.src.__support.CPP.span
       libc.src.__support.CPP.type_traits
   )
+  add_header_library(
+    freelist
+    HDRS
+      freelist.h
+    DEPENDS
+      libc.src.__support.fixedvector
+      libc.src.__support.CPP.cstddef
+      libc.src.__support.CPP.array
+      libc.src.__support.CPP.span
+  )
   add_entrypoint_external(
     malloc
   )
diff --git a/libc/src/stdlib/freelist.h b/libc/src/stdlib/freelist.h
new file mode 100644
index 00000000000000..c01ed6eddb7d46
--- /dev/null
+++ b/libc/src/stdlib/freelist.h
@@ -0,0 +1,198 @@
+//===-- Interface for freelist_malloc -------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STDLIB_FREELIST_H
+#define LLVM_LIBC_SRC_STDLIB_FREELIST_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/cstddef.h"
+#include "src/__support/CPP/span.h"
+#include "src/__support/fixedvector.h"
+
+namespace LIBC_NAMESPACE {
+
+using cpp::span;
+
+/// Basic [freelist](https://en.wikipedia.org/wiki/Free_list) implementation
+/// for an allocator. This implementation buckets by chunk size, with a list
+/// of user-provided buckets. Each bucket is a linked list of storage chunks.
+/// Because this freelist uses the added chunks themselves as list nodes, there
+/// is a lower bound of `sizeof(FreeList.FreeListNode)` bytes for chunks which
+/// can be added to this freelist. There is also an implicit bucket for
+/// "everything else", for chunks which do not fit into a bucket.
+///
+/// Each added chunk will be added to the smallest bucket under which it fits.
+/// If it does not fit into any user-provided bucket, it will be added to the
+/// default bucket.
+///
+/// As an example, assume that the `FreeList` is configured with buckets of
+/// sizes {64, 128, 256, and 512} bytes. The internal state may look like the
+/// following:
+///
+/// @code{.unparsed}
+/// bucket[0] (64B) --> chunk[12B] --> chunk[42B] --> chunk[64B] --> NULL
+/// bucket[1] (128B) --> chunk[65B] --> chunk[72B] --> NULL
+/// bucket[2] (256B) --> NULL
+/// bucket[3] (512B) --> chunk[312B] --> chunk[512B] --> chunk[416B] --> NULL
+/// bucket[4] (implicit) --> chunk[1024B] --> chunk[513B] --> NULL
+/// @endcode
+///
+/// Note that added chunks should be aligned to a 4-byte boundary.
+template <size_t NUM_BUCKETS = 6> class FreeList {
+public:
+  // Remove copy/move ctors
+  FreeList(const FreeList &other) = delete;
+  FreeList(FreeList &&other) = delete;
+  FreeList &operator=(const FreeList &other) = delete;
+  FreeList &operator=(FreeList &&other) = delete;
+
+  /// Adds a chunk to this freelist.
+  bool add_chunk(cpp::span<cpp::byte> chunk);
+
+  /// Finds an eligible chunk for an allocation of size `size`.
+  ///
+  /// @note This returns the first allocation possible within a given bucket;
+  /// It does not currently optimize for finding the smallest chunk.
+  ///
+  /// @returns
+  /// * On success - A span representing the chunk.
+  /// * On failure (e.g. there were no chunks available for that allocation) -
+  ///   A span with a size of 0.
+  cpp::span<cpp::byte> find_chunk(size_t size) const;
+
+  /// Removes a chunk from this freelist.
+  bool remove_chunk(cpp::span<cpp::byte> chunk);
+
+private:
+  // For a given size, find which index into chunks_ the node should be written
+  // to.
+  size_t find_chunk_ptr_for_size(size_t size, bool non_null) const;
+
+  struct FreeListNode {
+    FreeListNode *next;
+    size_t size;
+  };
+
+public:
+  explicit FreeList(cpp::array<size_t, NUM_BUCKETS> sizes)
+      : chunks_(NUM_BUCKETS + 1, 0), sizes_(sizes.begin(), sizes.end()) {}
+
+  FixedVector<FreeList::FreeListNode *, NUM_BUCKETS + 1> chunks_;
+  FixedVector<size_t, NUM_BUCKETS> sizes_;
+};
+
+template <size_t NUM_BUCKETS>
+bool FreeList<NUM_BUCKETS>::add_chunk(span<cpp::byte> chunk) {
+  // Check that the size is enough to actually store what we need
+  if (chunk.size() < sizeof(FreeListNode))
+    return false;
+
+  union {
+    FreeListNode *node;
+    cpp::byte *bytes;
+  } aliased;
+
+  aliased.bytes = chunk.data();
+
+  size_t chunk_ptr = find_chunk_ptr_for_size(chunk.size(), false);
+
+  // Add it to the correct list.
+  aliased.node->size = chunk.size();
+  aliased.node->next = chunks_[chunk_ptr];
+  chunks_[chunk_ptr] = aliased.node;
+
+  return true;
+}
+
+template <size_t NUM_BUCKETS>
+span<cpp::byte> FreeList<NUM_BUCKETS>::find_chunk(size_t size) const {
+  if (size == 0)
+    return span<cpp::byte>();
+
+  size_t chunk_ptr = find_chunk_ptr_for_size(size, true);
+
+  // Check that there's data. This catches the case where we run off the
+  // end of the array
+  if (chunks_[chunk_ptr] == nullptr)
+    return span<cpp::byte>();
+
+  // Now iterate up the buckets, walking each list to find a good candidate
+  for (size_t i = chunk_ptr; i < chunks_.size(); i++) {
+    union {
+      FreeListNode *node;
+      cpp::byte *data;
+    } aliased;
+    aliased.node = chunks_[static_cast<unsigned short>(i)];
+
+    while (aliased.node != nullptr) {
+      if (aliased.node->size >= size)
+        return span<cpp::byte>(aliased.data, aliased.node->size);
+
+      aliased.node = aliased.node->next;
+    }
+  }
+
+  // If we get here, we've checked every block in every bucket. There's
+  // nothing that can support this allocation.
+  return span<cpp::byte>();
+}
+
+template <size_t NUM_BUCKETS>
+bool FreeList<NUM_BUCKETS>::remove_chunk(span<cpp::byte> chunk) {
+  size_t chunk_ptr = find_chunk_ptr_for_size(chunk.size(), true);
+
+  // Walk that list, finding the chunk.
+  union {
+    FreeListNode *node;
+    cpp::byte *data;
+  } aliased, aliased_next;
+
+  // Check head first.
+  if (chunks_[chunk_ptr] == nullptr)
+    return false;
+
+  aliased.node = chunks_[chunk_ptr];
+  if (aliased.data == chunk.data()) {
+    chunks_[chunk_ptr] = aliased.node->next;
+    return true;
+  }
+
+  // No? Walk the nodes.
+  aliased.node = chunks_[chunk_ptr];
+
+  while (aliased.node->next != nullptr) {
+    aliased_next.node = aliased.node->next;
+    if (aliased_next.data == chunk.data()) {
+      // Found it, remove this node out of the chain
+      aliased.node->next = aliased_next.node->next;
+      return true;
+    }
+
+    aliased.node = aliased.node->next;
+  }
+
+  return false;
+}
+
+template <size_t NUM_BUCKETS>
+size_t FreeList<NUM_BUCKETS>::find_chunk_ptr_for_size(size_t size,
+                                                      bool non_null) const {
+  size_t chunk_ptr = 0;
+  for (chunk_ptr = 0u; chunk_ptr < sizes_.size(); chunk_ptr++) {
+    if (sizes_[chunk_ptr] >= size &&
+        (!non_null || chunks_[chunk_ptr] != nullptr)) {
+      break;
+    }
+  }
+
+  return chunk_ptr;
+}
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_STDLIB_FREELIST_H
diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index f122cd56a60605..d3954f077a219f 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -67,6 +67,18 @@ add_libc_test(
     libc.src.string.memcpy
 )
 
+add_libc_test(
+  freelist_test
+  SUITE
+    libc-stdlib-tests
+  SRCS
+    freelist_test.cpp
+  DEPENDS
+    libc.src.stdlib.freelist
+    libc.src.__support.CPP.array
+    libc.src.__support.CPP.span
+)
+
 add_fp_unittest(
   strtod_test
   SUITE
diff --git a/libc/test/src/stdlib/freelist_test.cpp b/libc/test/src/stdlib/freelist_test.cpp
new file mode 100644
index 00000000000000..e25c74b47b8522
--- /dev/null
+++ b/libc/test/src/stdlib/freelist_test.cpp
@@ -0,0 +1,166 @@
+//===-- Unittests for a freelist --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <stddef.h>
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/span.h"
+#include "src/stdlib/freelist.h"
+#include "test/UnitTest/Test.h"
+
+using LIBC_NAMESPACE::FreeList;
+using LIBC_NAMESPACE::cpp::array;
+using LIBC_NAMESPACE::cpp::byte;
+using LIBC_NAMESPACE::cpp::span;
+
+static constexpr size_t SIZE = 8;
+static constexpr array<size_t, SIZE> example_sizes = {64,   128,  256,  512,
+                                                      1024, 2048, 4096, 8192};
+
+TEST(LlvmLibcFreeList, EmptyListHasNoMembers) {
+  FreeList<SIZE> list(example_sizes);
+
+  auto item = list.find_chunk(4);
+  EXPECT_EQ(item.size(), static_cast<size_t>(0));
+  item = list.find_chunk(128);
+  EXPECT_EQ(item.size(), static_cast<size_t>(0));
+}
+
+TEST(LlvmLibcFreeList, CanRetrieveAddedMember) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t N = 512;
+
+  byte data[N] = {byte(0)};
+
+  bool ok = list.add_chunk(span<byte>(data, N));
+  EXPECT_TRUE(ok);
+
+  auto item = list.find_chunk(N);
+  EXPECT_EQ(item.size(), N);
+  EXPECT_EQ(item.data(), data);
+}
+
+TEST(LlvmLibcFreeList, CanRetrieveAddedMemberForSmallerSize) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t N = 512;
+
+  byte data[N] = {byte(0)};
+
+  ASSERT_TRUE(list.add_chunk(span<byte>(data, N)));
+  auto item = list.find_chunk(N / 2);
+  EXPECT_EQ(item.size(), N);
+  EXPECT_EQ(item.data(), data);
+}
+
+TEST(LlvmLibcFreeList, CanRemoveItem) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t N = 512;
+
+  byte data[N] = {byte(0)};
+
+  ASSERT_TRUE(list.add_chunk(span<byte>(data, N)));
+  EXPECT_TRUE(list.remove_chunk(span<byte>(data, N)));
+
+  auto item = list.find_chunk(N);
+  EXPECT_EQ(item.size(), static_cast<size_t>(0));
+}
+
+TEST(LlvmLibcFreeList, FindReturnsSmallestChunk) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t kN1 = 512;
+  constexpr size_t kN2 = 1024;
+
+  byte data1[kN1] = {byte(0)};
+  byte data2[kN2] = {byte(0)};
+
+  ASSERT_TRUE(list.add_chunk(span<byte>(data1, kN1)));
+  ASSERT_TRUE(list.add_chunk(span<byte>(data2, kN2)));
+
+  auto chunk = list.find_chunk(kN1 / 2);
+  EXPECT_EQ(chunk.size(), kN1);
+  EXPECT_EQ(chunk.data(), data1);
+
+  chunk = list.find_chunk(kN1);
+  EXPECT_EQ(chunk.size(), kN1);
+  EXPECT_EQ(chunk.data(), data1);
+
+  chunk = list.find_chunk(kN1 + 1);
+  EXPECT_EQ(chunk.size(), kN2);
+  EXPECT_EQ(chunk.data(), data2);
+}
+
+TEST(LlvmLibcFreeList, FindReturnsCorrectChunkInSameBucket) {
+  // If we have two values in the same bucket, ensure that the allocation will
+  // pick an appropriately sized one.
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t kN1 = 512;
+  constexpr size_t kN2 = 257;
+
+  byte data1[kN1] = {byte(0)};
+  byte data2[kN2] = {byte(0)};
+
+  // List should now be 257 -> 512 -> NULL
+  ASSERT_TRUE(list.add_chunk(span<byte>(data1, kN1)));
+  ASSERT_TRUE(list.add_chunk(span<byte>(data2, kN2)));
+
+  auto chunk = list.find_chunk(kN2 + 1);
+  EXPECT_EQ(chunk.size(), kN1);
+}
+
+TEST(LlvmLibcFreeList, FindCanMoveUpThroughBuckets) {
+  // Ensure that finding a chunk will move up through buckets if no appropriate
+  // chunks were found in a given bucket
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t kN1 = 257;
+  constexpr size_t kN2 = 513;
+
+  byte data1[kN1] = {byte(0)};
+  byte data2[kN2] = {byte(0)};
+
+  // List should now be:
+  // bkt[3] (257 bytes up to 512 bytes) -> 257 -> NULL
+  // bkt[4] (513 bytes up to 1024 bytes) -> 513 -> NULL
+  ASSERT_TRUE(list.add_chunk(span<byte>(data1, kN1)));
+  ASSERT_TRUE(list.add_chunk(span<byte>(data2, kN2)));
+
+  // Request a 300 byte chunk. This should return the 513 byte one
+  auto chunk = list.find_chunk(kN1 + 1);
+  EXPECT_EQ(chunk.size(), kN2);
+}
+
+TEST(LlvmLibcFreeList, RemoveUnknownChunkReturnsNotFound) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t N = 512;
+
+  byte data[N] = {byte(0)};
+  byte data2[N] = {byte(0)};
+
+  ASSERT_TRUE(list.add_chunk(span<byte>(data, N)));
+  EXPECT_FALSE(list.remove_chunk(span<byte>(data2, N)));
+}
+
+TEST(LlvmLibcFreeList, CanStoreMultipleChunksPerBucket) {
+  FreeList<SIZE> list(example_sizes);
+  constexpr size_t N = 512;
+
+  byte data1[N] = {byte(0)};
+  byte data2[N] = {byte(0)};
+
+  ASSERT_TRUE(list.add_chunk(span<byte>(data1, N)));
+  ASSERT_TRUE(list.add_chunk(span<byte>(data2, N)));
+
+  auto chunk1 = list.find_chunk(N);
+  ASSERT_TRUE(list.remove_chunk(chunk1));
+  auto chunk2 = list.find_chunk(N);
+  ASSERT_TRUE(list.remove_chunk(chunk2));
+
+  // Ordering of the chunks doesn't matter
+  EXPECT_TRUE(chunk1.data() != chunk2.data());
+  EXPECT_TRUE(chunk1.data() == data1 || chunk1.data() == data2);
+  EXPECT_TRUE(chunk2.data() == data1 || chunk2.data() == data2);
+}
diff --git a/libcxx/include/__algorithm/simd_utils.h b/libcxx/include/__algorithm/simd_utils.h
index aa4336a2214c87..549197be80183f 100644
--- a/libcxx/include/__algorithm/simd_utils.h
+++ b/libcxx/include/__algorithm/simd_utils.h
@@ -11,6 +11,7 @@
 
 #include <__algorithm/min.h>
 #include <__bit/bit_cast.h>
+#include <__bit/countl.h>
 #include <__bit/countr.h>
 #include <__config>
 #include <__type_traits/is_arithmetic.h>
@@ -126,8 +127,13 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI size_t __find_first_set(__simd_vector<_T
 
   // This has MSan disabled du to https://github.com/llvm/llvm-project/issues/85876
   auto __impl = [&]<class _MaskT>(_MaskT) _LIBCPP_NO_SANITIZE("memory") noexcept {
+#  if defined(_LIBCPP_BIG_ENDIAN)
+    return std::min<size_t>(
+        _Np, std::__countl_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+#  else
     return std::min<size_t>(
         _Np, std::__countr_zero(__builtin_bit_cast(_MaskT, __builtin_convertvector(__vec, __mask_vec))));
+#  endif
   };
 
   if constexpr (sizeof(__mask_vec) == sizeof(uint8_t)) {
diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
index 4dd23bb1e4dbec..2979bf69bf762a 100644
--- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
+++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp
@@ -5140,12 +5140,20 @@ uint32_t ObjectFileMachO::GetDependentModules(FileSpecList &files) {
       case LC_LOADFVMLIB:
       case LC_LOAD_UPWARD_DYLIB: {
         uint32_t name_offset = cmd_offset + m_data.GetU32(&offset);
+        // For LC_LOAD_DYLIB there is an alternate encoding
+        // which adds a uint32_t `flags` field for `DYLD_USE_*`
+        // flags.  This can be detected by a timestamp field with
+        // the `DYLIB_USE_MARKER` constant value.
         bool is_delayed_init = false;
         uint32_t use_command_marker = m_data.GetU32(&offset);
         if (use_command_marker == 0x1a741800 /* DYLIB_USE_MARKER */) {
           offset += 4; /* uint32_t current_version */
           offset += 4; /* uint32_t compat_version */
           uint32_t flags = m_data.GetU32(&offset);
+          // If this LC_LOAD_DYLIB is marked delay-init,
+          // don't report it as a dependent library -- it
+          // may be loaded in the process at some point,
+          // but will most likely not be load at launch.
           if (flags & 0x08 /* DYLIB_USE_DELAYED_INIT */)
             is_delayed_init = true;
         }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
index 90e42be7202d87..1d17f20670eed4 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DebugNamesDWARFIndex.cpp
@@ -85,6 +85,11 @@ bool DebugNamesDWARFIndex::ProcessEntry(
   DWARFDIE die = GetDIE(entry);
   if (!die)
     return true;
+  // Clang used to erroneously emit index entries for declaration DIEs in case
+  // when the definition is in a type unit (llvm.org/pr77696).
+  if (die.IsStructUnionOrClass() &&
+      die.GetAttributeValueAsUnsigned(DW_AT_declaration, 0))
+    return true;
   return callback(die);
 }
 
diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile
new file mode 100644
index 00000000000000..246ea0f34e1a1c
--- /dev/null
+++ b/lldb/test/API/macosx/delay-init-dependency/Makefile
@@ -0,0 +1,11 @@
+C_SOURCES := main.c
+LD_EXTRAS := -L. -Wl,-delay_library,libfoo.dylib
+
+.PHONY: build-libfoo
+all: build-libfoo a.out
+
+include Makefile.rules
+
+build-libfoo: foo.c
+	$(MAKE) -f $(MAKEFILE_RULES) \
+		DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES
diff --git a/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py b/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py
new file mode 100644
index 00000000000000..44ed2b1d21f186
--- /dev/null
+++ b/lldb/test/API/macosx/delay-init-dependency/TestDelayInitDependency.py
@@ -0,0 +1,62 @@
+"""Test binaries with delay-init dependencies."""
+
+import subprocess
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class TestDelayInitDependencies(TestBase):
+    NO_DEBUG_INFO_TESTCASE = True
+
+    @skipUnlessDarwin
+    def test_delay_init_dependency(self):
+        TestBase.setUp(self)
+        out = subprocess.run(
+            ["xcrun", "ld", "-delay_library"],
+            universal_newlines=True,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+        )
+        if "delay_library missing" not in out.stderr:
+            self.skipTest(
+                "Skipped because the linker doesn't know about -delay_library"
+            )
+        self.build()
+        main_source = "main.c"
+        exe = self.getBuildArtifact("a.out")
+        lib = self.getBuildArtifact("libfoo.dylib")
+
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target, VALID_TARGET)
+
+        # libfoo.dylib should not be in the target pre-execution
+        for m in target.modules:
+            self.assertNotEqual(m.GetFileSpec().GetFilename(), "libfoo.dylib")
+
+        # This run without arguments will not load libfoo.dylib
+        li = lldb.SBLaunchInfo([])
+        li.SetWorkingDirectory(self.getBuildDir())
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c"), li
+        )
+        for m in target.modules:
+            self.assertNotEqual(m.GetFileSpec().GetFilename(), "libfoo.dylib")
+
+        process.Kill()
+        self.dbg.DeleteTarget(target)
+
+        # This run with one argument will load libfoo.dylib
+        li = lldb.SBLaunchInfo([])
+        li.SetWorkingDirectory(self.getBuildDir())
+        li.SetArguments(["one-argument"], True)
+        (target, process, thread, bkpt) = lldbutil.run_to_source_breakpoint(
+            self, "// break here", lldb.SBFileSpec("main.c"), li
+        )
+
+        found_libfoo = False
+        for m in target.modules:
+            if m.GetFileSpec().GetFilename() == "libfoo.dylib":
+                found_libfoo = True
+        self.assertTrue(found_libfoo)
diff --git a/lldb/test/API/macosx/delay-init-dependency/foo.c b/lldb/test/API/macosx/delay-init-dependency/foo.c
new file mode 100644
index 00000000000000..de1cbc4c4648a1
--- /dev/null
+++ b/lldb/test/API/macosx/delay-init-dependency/foo.c
@@ -0,0 +1 @@
+int foo() { return 5; }
diff --git a/lldb/test/API/macosx/delay-init-dependency/main.c b/lldb/test/API/macosx/delay-init-dependency/main.c
new file mode 100644
index 00000000000000..57d251e6b2abe4
--- /dev/null
+++ b/lldb/test/API/macosx/delay-init-dependency/main.c
@@ -0,0 +1,9 @@
+int foo();
+int main(int argc, char **argv) {
+  int retval = 0;
+  // Only call foo() if one argument is passed
+  if (argc == 2)
+    retval = foo();
+
+  return retval; // break here
+}
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s
new file mode 100644
index 00000000000000..7b845a72bbed46
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-signature.s
@@ -0,0 +1,265 @@
+## Test that we can correctly complete types even if the debug_names index
+## contains entries referring to declaration dies (clang emitted entries like
+## that until bd5c6367bd7).
+##
+## This test consists of two compile units and one type unit. CU1 has the
+## definition of a variable, but only a forward-declaration of its type. When
+## attempting to find a definition, the debug_names lookup will return the DIE
+## in CU0, which is also a forward-declaration (with a reference to a type
+## unit). LLDB needs to find the definition of the type within the type unit.
+
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t
+# RUN: %lldb %t -o "target variable s" -o exit | FileCheck %s
+
+# CHECK:      (lldb) target variable s
+# CHECK-NEXT: (Struct) s = (member = 47)
+
+        .data
+        .p2align        2, 0x0
+        .long   0
+s:
+        .long   47                              # 0x2f
+
+        .section        .debug_abbrev,"",@progbits
+        .byte   1                               # Abbreviation Code
+        .byte   65                              # DW_TAG_type_unit
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   19                              # DW_AT_language
+        .byte   5                               # DW_FORM_data2
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   2                               # Abbreviation Code
+        .byte   19                              # DW_TAG_structure_type
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   54                              # DW_AT_calling_convention
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   11                              # DW_AT_byte_size
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   3                               # Abbreviation Code
+        .byte   13                              # DW_TAG_member
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   73                              # DW_AT_type
+        .byte   19                              # DW_FORM_ref4
+        .byte   56                              # DW_AT_data_member_location
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   4                               # Abbreviation Code
+        .byte   36                              # DW_TAG_base_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   62                              # DW_AT_encoding
+        .byte   11                              # DW_FORM_data1
+        .byte   11                              # DW_AT_byte_size
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   5                               # Abbreviation Code
+        .byte   17                              # DW_TAG_compile_unit
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   37                              # DW_AT_producer
+        .byte   8                               # DW_FORM_string
+        .byte   19                              # DW_AT_language
+        .byte   5                               # DW_FORM_data2
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   6                               # Abbreviation Code
+        .byte   52                              # DW_TAG_variable
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   73                              # DW_AT_type
+        .byte   19                              # DW_FORM_ref4
+        .byte   2                               # DW_AT_location
+        .byte   24                              # DW_FORM_exprloc
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   7                               # Abbreviation Code
+        .byte   19                              # DW_TAG_structure_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   60                              # DW_AT_declaration
+        .byte   25                              # DW_FORM_flag_present
+        .byte   105                             # DW_AT_signature
+        .byte   32                              # DW_FORM_ref_sig8
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   8                               # Abbreviation Code
+        .byte   19                              # DW_TAG_structure_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   60                              # DW_AT_declaration
+        .byte   25                              # DW_FORM_flag_present
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   0                               # EOM(3)
+
+        .section        .debug_info,"",@progbits
+.Ltu_begin0:
+        .long   .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+        .short  5                               # DWARF version number
+        .byte   2                               # DWARF Unit Type
+        .byte   8                               # Address Size (in bytes)
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .quad   4878254330033667422             # Type Signature
+        .long   .LStruct_def-.Ltu_begin0        # Type DIE Offset
+        .byte   1                               # Abbrev [1] 0x18:0x20 DW_TAG_type_unit
+        .short  33                              # DW_AT_language
+.LStruct_def:
+        .byte   2                               # Abbrev [2] 0x23:0x10 DW_TAG_structure_type
+        .byte   5                               # DW_AT_calling_convention
+        .long   .Linfo_string6                  # DW_AT_name
+        .byte   4                               # DW_AT_byte_size
+        .byte   3                               # Abbrev [3] 0x29:0x9 DW_TAG_member
+        .long   .Linfo_string4                  # DW_AT_name
+        .long   .Lint-.Ltu_begin0               # DW_AT_type
+        .byte   0                               # DW_AT_data_member_location
+        .byte   0                               # End Of Children Mark
+.Lint:
+        .byte   4                               # Abbrev [4] 0x33:0x4 DW_TAG_base_type
+        .long   .Linfo_string5                  # DW_AT_name
+        .byte   5                               # DW_AT_encoding
+        .byte   4                               # DW_AT_byte_size
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end0:
+
+.Lcu_begin0:
+        .long   .Ldebug_info_end1-.Ldebug_info_start1 # Length of Unit
+.Ldebug_info_start1:
+        .short  5                               # DWARF version number
+        .byte   1                               # DWARF Unit Type
+        .byte   8                               # Address Size (in bytes)
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .byte   5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+        .asciz  "Hand-written DWARF"            # DW_AT_producer
+        .short  33                              # DW_AT_language
+.Ls:
+        .byte   6                               # Abbrev [6] 0x1e:0xb DW_TAG_variable
+        .long   .Linfo_string3                  # DW_AT_name
+        .long   .LStruct_decl2-.Lcu_begin0       # DW_AT_type
+        .byte   9                               # DW_AT_location
+        .byte   3
+        .quad   s
+.LStruct_decl2:
+        .byte   8                               # Abbrev [8] 0x29:0x9 DW_TAG_structure_type
+        .long   .Linfo_string6                  # DW_AT_name
+                                                # DW_AT_declaration
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end1:
+
+.Lcu_begin1:
+        .long   .Ldebug_info_end2-.Ldebug_info_start2 # Length of Unit
+.Ldebug_info_start2:
+        .short  5                               # DWARF version number
+        .byte   1                               # DWARF Unit Type
+        .byte   8                               # Address Size (in bytes)
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .byte   5                               # Abbrev [5] 0xc:0x27 DW_TAG_compile_unit
+        .asciz  "Hand-written DWARF"            # DW_AT_producer
+        .short  33                              # DW_AT_language
+.LStruct_decl:
+        .byte   7                               # Abbrev [7] 0x29:0x9 DW_TAG_structure_type
+                                                # DW_AT_declaration
+        .quad   4878254330033667422             # DW_AT_signature
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end2:
+
+        .section        .debug_str,"MS",@progbits,1
+.Linfo_string3:
+        .asciz  "s"                             # string offset=60
+.Linfo_string4:
+        .asciz  "member"                        # string offset=62
+.Linfo_string5:
+        .asciz  "int"                           # string offset=69
+.Linfo_string6:
+        .asciz  "Struct"                        # string offset=73
+
+        .section        .debug_names,"",@progbits
+        .long   .Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+        .short  5                               # Header: version
+        .short  0                               # Header: padding
+        .long   2                               # Header: compilation unit count
+        .long   1                               # Header: local type unit count
+        .long   0                               # Header: foreign type unit count
+        .long   0                               # Header: bucket count
+        .long   3                               # Header: name count
+        .long   .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+        .long   8                               # Header: augmentation string size
+        .ascii  "LLVM0700"                      # Header: augmentation string
+        .long   .Lcu_begin0                     # Compilation unit 0
+        .long   .Lcu_begin1                     # Compilation unit 1
+        .long   .Ltu_begin0                     # Type unit 0
+        .long   .Linfo_string6                  # String in Bucket 0: Struct
+        .long   .Linfo_string3                  # String in Bucket 1: s
+        .long   .Linfo_string5                  # String in Bucket 2: int
+        .long   .Lnames1-.Lnames_entries0       # Offset in Bucket 0
+        .long   .Lnames2-.Lnames_entries0       # Offset in Bucket 1
+        .long   .Lnames0-.Lnames_entries0       # Offset in Bucket 2
+.Lnames_abbrev_start0:
+        .byte   1                               # Abbrev code
+        .byte   19                              # DW_TAG_structure_type
+        .byte   2                               # DW_IDX_type_unit
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   2                               # Abbrev code
+        .byte   52                              # DW_TAG_variable
+        .byte   1                               # DW_IDX_compile_unit
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   3                               # Abbrev code
+        .byte   36                              # DW_TAG_base_type
+        .byte   2                               # DW_IDX_type_unit
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   4                               # Abbrev code
+        .byte   19                              # DW_TAG_structure_type
+        .byte   1                               # DW_IDX_compile_unit
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames1:
+        .byte   4                               # Abbreviation code
+        .byte   1                               # DW_IDX_compile_unit
+        .long   .LStruct_decl-.Lcu_begin1       # DW_IDX_die_offset
+        .byte   1                               # Abbreviation code
+        .byte   0                               # DW_IDX_type_unit
+        .long   .LStruct_def-.Ltu_begin0        # DW_IDX_die_offset
+        .byte   0
+                                        # End of list: Struct
+.Lnames2:
+        .byte   2                               # Abbreviation code
+        .byte   0                               # DW_IDX_compile_unit
+        .long   .Ls-.Lcu_begin0                 # DW_IDX_die_offset
+        .byte   0
+                                        # End of list: s
+.Lnames0:
+        .byte   3                               # Abbreviation code
+        .byte   0                               # DW_IDX_type_unit
+        .long   .Lint-.Ltu_begin0               # DW_IDX_die_offset
+        .byte   0
+                                        # End of list: int
+        .p2align        2, 0x0
+.Lnames_end0:
diff --git a/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s
new file mode 100644
index 00000000000000..9cb534207c3d13
--- /dev/null
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/debug-names-static-constexpr-member.s
@@ -0,0 +1,169 @@
+## Check that lldb can locate a static constant variable when its declaration is
+## referenced by a debug_names index. This is a non-conforming extension used by
+## dsymutil.
+
+# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t
+# RUN: %lldb %t -o "target variable Class::constant" \
+# RUN:   -o "expr -l c++ -- Class::constant" -o exit | FileCheck %s
+
+# CHECK:      (lldb) target variable Class::constant
+# CHECK-NEXT: (const int) Class::constant = 47
+# CHECK:      (lldb) expr -l c++ -- Class::constant
+# CHECK-NEXT: (const int) $0 = 47
+
+        .section        .debug_abbrev,"",@progbits
+        .byte   1                               # Abbreviation Code
+        .byte   17                              # DW_TAG_compile_unit
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   37                              # DW_AT_producer
+        .byte   8                               # DW_FORM_string
+        .byte   19                              # DW_AT_language
+        .byte   5                               # DW_FORM_data2
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   3                               # Abbreviation Code
+        .byte   2                               # DW_TAG_class_type
+        .byte   1                               # DW_CHILDREN_yes
+        .byte   54                              # DW_AT_calling_convention
+        .byte   11                              # DW_FORM_data1
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   11                              # DW_AT_byte_size
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   4                               # Abbreviation Code
+        .byte   52                              # DW_TAG_variable
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   73                              # DW_AT_type
+        .byte   19                              # DW_FORM_ref4
+        .byte   63                              # DW_AT_external
+        .byte   25                              # DW_FORM_flag_present
+        .byte   60                              # DW_AT_declaration
+        .byte   25                              # DW_FORM_flag_present
+        .byte   28                              # DW_AT_const_value
+        .byte   13                              # DW_FORM_sdata
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   5                               # Abbreviation Code
+        .byte   38                              # DW_TAG_const_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   73                              # DW_AT_type
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   6                               # Abbreviation Code
+        .byte   36                              # DW_TAG_base_type
+        .byte   0                               # DW_CHILDREN_no
+        .byte   3                               # DW_AT_name
+        .byte   14                              # DW_FORM_strp
+        .byte   62                              # DW_AT_encoding
+        .byte   11                              # DW_FORM_data1
+        .byte   11                              # DW_AT_byte_size
+        .byte   11                              # DW_FORM_data1
+        .byte   0                               # EOM(1)
+        .byte   0                               # EOM(2)
+        .byte   0                               # EOM(3)
+        .section        .debug_info,"",@progbits
+.Lcu_begin0:
+        .long   .Ldebug_info_end0-.Ldebug_info_start0 # Length of Unit
+.Ldebug_info_start0:
+        .short  5                               # DWARF version number
+        .byte   1                               # DWARF Unit Type
+        .byte   8                               # Address Size (in bytes)
+        .long   .debug_abbrev                   # Offset Into Abbrev. Section
+        .byte   1                               # Abbrev [1] 0xc:0x40 DW_TAG_compile_unit
+        .asciz  "Hand-written DWARF"            # DW_AT_producer
+        .short  33                              # DW_AT_language
+.LClass:
+        .byte   3                               # Abbrev [3] 0x29:0x10 DW_TAG_class_type
+        .byte   5                               # DW_AT_calling_convention
+        .long   .Linfo_string4                  # DW_AT_name
+        .byte   1                               # DW_AT_byte_size
+.Lvariable:
+        .byte   4                               # Abbrev [4] 0x2f:0x9 DW_TAG_variable
+        .long   .Linfo_string5                  # DW_AT_name
+        .long   .Lconst_int-.Lcu_begin0         # DW_AT_type
+                                                # DW_AT_external
+                                                # DW_AT_declaration
+        .byte   47                              # DW_AT_const_value
+        .byte   0                               # End Of Children Mark
+.Lconst_int:
+        .byte   5                               # Abbrev [5] 0x39:0x5 DW_TAG_const_type
+        .long   .Lint-.Lcu_begin0               # DW_AT_type
+.Lint:
+        .byte   6                               # Abbrev [6] 0x3e:0x4 DW_TAG_base_type
+        .long   .Linfo_string6                  # DW_AT_name
+        .byte   5                               # DW_AT_encoding
+        .byte   4                               # DW_AT_byte_size
+        .byte   0                               # End Of Children Mark
+.Ldebug_info_end0:
+
+        .section        .debug_str,"MS",@progbits,1
+.Linfo_string4:
+        .asciz  "Class"
+.Linfo_string5:
+        .asciz  "constant"
+.Linfo_string6:
+        .asciz  "int"
+
+        .section        .debug_names,"",@progbits
+        .long   .Lnames_end0-.Lnames_start0     # Header: unit length
+.Lnames_start0:
+        .short  5                               # Header: version
+        .short  0                               # Header: padding
+        .long   1                               # Header: compilation unit count
+        .long   0                               # Header: local type unit count
+        .long   0                               # Header: foreign type unit count
+        .long   0                               # Header: bucket count
+        .long   3                               # Header: name count
+        .long   .Lnames_abbrev_end0-.Lnames_abbrev_start0 # Header: abbreviation table size
+        .long   8                               # Header: augmentation string size
+        .ascii  "LLVM0700"                      # Header: augmentation string
+        .long   .Lcu_begin0                     # Compilation unit 0
+        .long   .Linfo_string4                  # String: Class
+        .long   .Linfo_string5                  # String: constant
+        .long   .Linfo_string6                  # String: int
+        .long   .Lnames0-.Lnames_entries0
+        .long   .Lnames3-.Lnames_entries0
+        .long   .Lnames1-.Lnames_entries0
+.Lnames_abbrev_start0:
+        .byte   1                               # Abbrev code
+        .byte   2                               # DW_TAG_class_type
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   2                               # Abbrev code
+        .byte   52                              # DW_TAG_variable
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   3                               # Abbrev code
+        .byte   36                              # DW_TAG_base_type
+        .byte   3                               # DW_IDX_die_offset
+        .byte   19                              # DW_FORM_ref4
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev
+        .byte   0                               # End of abbrev list
+.Lnames_abbrev_end0:
+.Lnames_entries0:
+.Lnames0:
+        .byte   1                               # Abbreviation code
+        .long   .LClass-.Lcu_begin0             # DW_IDX_die_offset
+        .byte   0                               # DW_IDX_parent
+                                        # End of list: Class
+.Lnames3:
+        .byte   2                               # Abbreviation code
+        .long   .Lvariable-.Lcu_begin0          # DW_IDX_die_offset
+        .byte   0                               # DW_IDX_parent
+                                        # End of list: constant
+.Lnames1:
+        .byte   3                               # Abbreviation code
+        .long   .Lint-.Lcu_begin0               # DW_IDX_die_offset
+        .byte   0                               # DW_IDX_parent
+                                        # End of list: int
+.Lnames_end0:
diff --git a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
index 277cc14ce50c9f..597e5b2e40d5e0 100644
--- a/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
+++ b/lldb/unittests/UnwindAssembly/x86/Testx86AssemblyInspectionEngine.cpp
@@ -1731,7 +1731,29 @@ TEST_F(Testx86AssemblyInspectionEngine, TestAddESP) {
   EXPECT_EQ(4 - 16, row_sp->GetCFAValue().GetOffset());
 }
 
-// FIXME add test for lea_rsp_pattern_p
+TEST_F(Testx86AssemblyInspectionEngine, TestLEA_RSP_Pattern) {
+  UnwindPlan::Row::RegisterLocation regloc;
+  UnwindPlan::RowSP row_sp;
+  AddressRange sample_range;
+  UnwindPlan unwind_plan(eRegisterKindLLDB);
+  std::unique_ptr<x86AssemblyInspectionEngine> engine = Getx86_64Inspector();
+
+  uint8_t data[] = {
+      0x8d, 0x64, 0x24, 0x10, // lea rsp, [rsp + 0x10]
+      0x90                    // nop
+  };
+
+  sample_range = AddressRange(0x1000, sizeof(data));
+
+  EXPECT_TRUE(engine->GetNonCallSiteUnwindPlanFromAssembly(
+      data, sizeof(data), sample_range, unwind_plan));
+
+  row_sp = unwind_plan.GetRowForFunctionOffset(0);
+  EXPECT_EQ(0ull, row_sp->GetOffset());
+  EXPECT_TRUE(row_sp->GetCFAValue().GetRegisterNumber() == k_rsp);
+  EXPECT_TRUE(row_sp->GetCFAValue().IsRegisterPlusOffset() == true);
+  EXPECT_EQ(8, row_sp->GetCFAValue().GetOffset());
+}
 
 TEST_F(Testx86AssemblyInspectionEngine, TestPopRBX) {
   UnwindPlan::Row::RegisterLocation regloc;
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 5a165acf978d18..e1157c1f7e07fd 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -5940,6 +5940,33 @@ following sections:
 * :ref:`amdgpu-amdhsa-memory-model-gfx942`
 * :ref:`amdgpu-amdhsa-memory-model-gfx10-gfx11`
 
+.. _amdgpu-fence-as:
+
+Fence and Address Spaces
+++++++++++++++++++++++++++++++
+
+LLVM fences do not have address space information, thus, fence
+codegen usually needs to conservatively synchronize all address spaces.
+
+In the case of OpenCL, where fences only need to synchronize
+user-specified address spaces, this can result in extra unnecessary waits.
+For instance, a fence that is supposed to only synchronize local memory will
+also have to wait on all global memory operations, which is unnecessary.
+
+:doc:`Memory Model Relaxation Annotations <MemoryModelRelaxationAnnotations>` can
+be used as an optimization hint for fences to solve this problem.
+The AMDGPU backend recognizes the following tags on fences:
+
+- ``amdgpu-as:local`` - fence only the local address space
+- ``amdgpu-as:global``- fence only the global address space
+
+.. note::
+
+  As an optimization hint, those tags are not guaranteed to survive until
+  code generation. Optimizations are free to drop the tags to allow for
+  better code optimization, at the cost of synchronizing additional address
+  spaces.
+
 .. _amdgpu-amdhsa-memory-model-gfx6-gfx9:
 
 Memory Model GFX6-GFX9
@@ -6277,21 +6304,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                            - If OpenCL and
                                                              address space is
                                                              not generic, omit.
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Must happen after
                                                              any preceding
                                                              local/generic load
@@ -6323,14 +6338,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -6533,21 +6543,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                            - If OpenCL and
                                                              address space is
                                                              not generic, omit.
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Must happen after
                                                              any preceding
                                                              local/generic
@@ -6583,21 +6581,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -6927,14 +6913,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -7875,21 +7856,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - s_waitcnt vmcnt(0)
                                                              must happen after
                                                              any preceding
@@ -7948,14 +7917,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -8026,14 +7990,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -8401,21 +8360,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - s_waitcnt vmcnt(0)
                                                              must happen after
                                                              any preceding
@@ -8461,21 +8408,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -8543,21 +8478,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -9178,14 +9101,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -9287,14 +9205,9 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx90a-table`.
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -10250,21 +10163,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - s_waitcnt vmcnt(0)
                                                              must happen after
                                                              any preceding
@@ -10323,14 +10224,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -10401,14 +10297,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -10807,21 +10698,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - s_waitcnt vmcnt(0)
                                                              must happen after
                                                              any preceding
@@ -10880,21 +10759,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -10959,21 +10826,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              local, omit
                                                              vmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -11622,14 +11477,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -11731,14 +11581,9 @@ are defined in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx940-gfx9
                                                              address space is
                                                              not generic, omit
                                                              lgkmcnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0) and
@@ -12584,21 +12429,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0) and vscnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0), s_waitcnt
@@ -12681,14 +12514,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0) and vscnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0), s_waitcnt
@@ -13052,21 +12880,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0) and vscnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0), s_waitcnt
@@ -13125,21 +12941,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0) and vscnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate. If
-                                                             fence had an
-                                                             address space then
-                                                             set to address
-                                                             space of OpenCL
-                                                             fence flag, or to
-                                                             generic if both
-                                                             local and global
-                                                             flags are
-                                                             specified.
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0), s_waitcnt
@@ -13691,14 +13495,9 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-gfx11-table`.
                                                              address space is
                                                              local, omit
                                                              vmcnt(0) and vscnt(0).
-                                                           - However, since LLVM
-                                                             currently has no
-                                                             address space on
-                                                             the fence need to
-                                                             conservatively
-                                                             always generate
-                                                             (see comment for
-                                                             previous fence).
+                                                           - See :ref:`amdgpu-fence-as` for
+                                                             more details on fencing specific
+                                                             address spaces.
                                                            - Could be split into
                                                              separate s_waitcnt
                                                              vmcnt(0), s_waitcnt
diff --git a/llvm/docs/AssignmentTracking.md b/llvm/docs/AssignmentTracking.md
index 5a8bc5844eef6b..a24a8b0d797f87 100644
--- a/llvm/docs/AssignmentTracking.md
+++ b/llvm/docs/AssignmentTracking.md
@@ -11,7 +11,7 @@ The core idea is to track more information about source assignments in order
 and preserve enough information to be able to defer decisions about whether to
 use non-memory locations (register, constant) or memory locations until after
 middle end optimisations have run. This is in opposition to using
-`llvm.dbg.declare` and `llvm.dbg.value`, which is to make the decision for most
+`#dbg_declare` and `#dbg_value`, which is to make the decision for most
 variables early on, which can result in suboptimal variable locations that may
 be either incorrect or incomplete.
 
@@ -26,19 +26,18 @@ except for development and testing.
 **Enable in Clang**: `-Xclang -fexperimental-assignment-tracking`
 
 That causes Clang to get LLVM to run the pass `declare-to-assign`. The pass
-converts conventional debug intrinsics to assignment tracking metadata and sets
+converts conventional debug records to assignment tracking metadata and sets
 the module flag `debug-info-assignment-tracking` to the value `i1 true`. To
 check whether assignment tracking is enabled for a module call
 `isAssignmentTrackingEnabled(const Module &M)` (from `llvm/IR/DebugInfo.h`).
 
 ## Design and implementation
 
-### Assignment markers: `llvm.dbg.assign`
+### Assignment markers: `#dbg_assign`
 
-`llvm.dbg.value`, a conventional debug intrinsic, marks out a position in the
+`#dbg_value`, a conventional debug record, marks out a position in the
 IR where a variable takes a particular value. Similarly, Assignment Tracking
-marks out the position of assignments with a new intrinsic called
-`llvm.dbg.assign`.
+marks out the position of assignments with a record called `#dbg_assign`.
 
 In order to know where in IR it is appropriate to use a memory location for a
 variable, each assignment marker must in some way refer to the store, if any
@@ -48,24 +47,23 @@ important benefit of referring to the store is that we can then build a two-way
 mapping of stores<->markers that can be used to find markers that need to be
 updated when stores are modified.
 
-An `llvm.dbg.assign` marker that is not linked to any instruction signals that
+An `#dbg_assign` marker that is not linked to any instruction signals that
 the store that performed the assignment has been optimised out, and therefore
 the memory location will not be valid for at least some part of the program.
 
-Here's the `llvm.dbg.assign` signature. Each parameter is wrapped in
-`MetadataAsValue`, and `Value *` type parameters are first wrapped in
-`ValueAsMetadata`:
+Here's the `#dbg_assign` signature. `Value *` type parameters are first wrapped
+in `ValueAsMetadata`:
 
 ```
-void @llvm.dbg.assign(Value *Value,
-                      DIExpression *ValueExpression,
-                      DILocalVariable *Variable,
-                      DIAssignID *ID,
-                      Value *Address,
-                      DIExpression *AddressExpression)
+  #dbg_assign(Value *Value,
+              DIExpression *ValueExpression,
+              DILocalVariable *Variable,
+              DIAssignID *ID,
+              Value *Address,
+              DIExpression *AddressExpression)
 ```
 
-The first three parameters look and behave like an `llvm.dbg.value`. `ID` is a
+The first three parameters look and behave like an `#dbg_value`. `ID` is a
 reference to a store (see next section). `Address` is the destination address
 of the store and it is modified by `AddressExpression`. An empty/undef/poison
 address means the address component has been killed (the memory address is no
@@ -73,18 +71,13 @@ longer a valid location). LLVM currently encodes variable fragment information
 in `DIExpression`s, so as an implementation quirk the `FragmentInfo` for
 `Variable` is contained within `ValueExpression` only.
 
-The formal LLVM-IR signature is:
-```
-void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
-```
-
 ### Instruction link: `DIAssignID`
 
 `DIAssignID` metadata is the mechanism that is currently used to encode the
 store<->marker link. The metadata node has no operands and all instances are
 `distinct`; equality is checked for by comparing addresses.
 
-`llvm.dbg.assign` intrinsics use a `DIAssignID` metadata node instance as an
+`#dbg_assign` records use a `DIAssignID` metadata node instance as an
 operand. This way it refers to any store-like instruction that has the same
 `DIAssignID` attachment. E.g. For this test.cpp,
 
@@ -102,9 +95,9 @@ we get:
 define dso_local noundef i32 @_Z3funi(i32 noundef %a) #0 !dbg !8 {
 entry:
   %a.addr = alloca i32, align 4, !DIAssignID !13
-  call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata i32* %a.addr, metadata !DIExpression()), !dbg !15
+    #dbg_assign(i1 undef, !14, !DIExpression(), !13, i32* %a.addr, !DIExpression(), !15)
   store i32 %a, i32* %a.addr, align 4, !DIAssignID !16
-  call void @llvm.dbg.assign(metadata i32 %a, metadata !14, metadata !DIExpression(), metadata !16, metadata i32* %a.addr, metadata !DIExpression()), !dbg !15
+    #dbg_assign(i32 %a, !14, !DIExpression(), !16, i32* %a.addr, !DIExpression(), !15)
   %0 = load i32, i32* %a.addr, align 4, !dbg !17
   ret i32 %0, !dbg !18
 }
@@ -116,16 +109,16 @@ entry:
 !16 = distinct !DIAssignID()
 ```
 
-The first `llvm.dbg.assign` refers to the `alloca` through `!DIAssignID !13`,
+The first `#dbg_assign` refers to the `alloca` through `!DIAssignID !13`,
 and the second refers to the `store` through `!DIAssignID !16`.
 
 ### Store-like instructions
 
-In the absence of a linked `llvm.dbg.assign`, a store to an address that is
+In the absence of a linked `#dbg_assign`, a store to an address that is
 known to be the backing storage for a variable is considered to represent an
 assignment to that variable.
 
-This gives us a safe fall-back in cases where `llvm.dbg.assign` intrinsics have
+This gives us a safe fall-back in cases where `#dbg_assign` records have
 been deleted, the `DIAssignID` attachment on the store has been dropped, or the
 optimiser has made a once-indirect store (not tracked with Assignment Tracking)
 direct.
@@ -139,61 +132,61 @@ direct.
 instruction. In this case, the assignment is considered to take place in
 multiple positions in the program.
 
-**Moving** a non-debug instruction: nothing new to do. Instructions linked to an
-`llvm.dbg.assign` have their initial IR position marked by the position of the
-`llvm.dbg.assign`.
+**Moving** a non-debug instruction: nothing new to do. Instructions linked to a
+`#dbg_assign` have their initial IR position marked by the position of the
+`#dbg_assign`.
 
 **Deleting** a non-debug instruction: nothing new to do. Simple DSE does not
 require any change; it’s safe to delete an instruction with a `DIAssignID`
-attachment. An `llvm.dbg.assign` that uses a `DIAssignID` that is not attached
+attachment. A `#dbg_assign` that uses a `DIAssignID` that is not attached
 to any instruction indicates that the memory location isn’t valid.
 
 **Merging** stores: In many cases no change is required as `DIAssignID`
 attachments are automatically merged if `combineMetadata` is called. One way or
 another, the `DIAssignID` attachments must be merged such that new store
-becomes linked to all the `llvm.dbg.assign` intrinsics that the merged stores
+becomes linked to all the `#dbg_assign` records that the merged stores
 were linked to. This can be achieved simply by calling a helper function
 `Instruction::mergeDIAssignID`.
 
-**Inlining** stores: As stores are inlined we generate `llvm.dbg.assign`
-intrinsics and `DIAssignID` attachments as if the stores represent source
+**Inlining** stores: As stores are inlined we generate `#dbg_assign`
+records and `DIAssignID` attachments as if the stores represent source
 assignments, just like the in frontend. This isn’t perfect, as stores may have
 been moved, modified or deleted before inlining, but it does at least keep the
 information about the variable correct within the non-inlined scope.
 
-**Splitting** stores: SROA and passes that split stores treat `llvm.dbg.assign`
-intrinsics similarly to `llvm.dbg.declare` intrinsics. Clone the
-`llvm.dbg.assign` intrinsics linked to the store, update the FragmentInfo in
-the `ValueExpression`, and give the split stores (and cloned intrinsics) new
+**Splitting** stores: SROA and passes that split stores treat `#dbg_assign`
+records similarly to `#dbg_declare` records. Clone the
+`#dbg_assign` records linked to the store, update the FragmentInfo in
+the `ValueExpression`, and give the split stores (and cloned records) new
 `DIAssignID` attachments each. In other words, treat the split stores as
 separate assignments. For partial DSE (e.g. shortening a memset), we do the
-same except that `llvm.dbg.assign` for the dead fragment gets an `Undef`
+same except that `#dbg_assign` for the dead fragment gets an `Undef`
 `Address`.
 
-**Promoting** allocas and store/loads: `llvm.dbg.assign` intrinsics implicitly
+**Promoting** allocas and store/loads: `#dbg_assign` records implicitly
 describe joined values in memory locations at CFG joins, but this is not
 necessarily the case after promoting (or partially promoting) the
 variable. Passes that promote variables are responsible for inserting
-`llvm.dbg.assign` intrinsics after the resultant PHIs generated during
-promotion. `mem2reg` already has to do this (with `llvm.dbg.value`) for
-`llvm.dbg.declare`s. Where a store has no linked intrinsic, the store is
+`#dbg_assign` records after the resultant PHIs generated during
+promotion. `mem2reg` already has to do this (with `#dbg_value`) for
+`#dbg_declare`s. Where a store has no linked record, the store is
 assumed to represent an assignment for variables stored at the destination
 address.
 
-#### Debug intrinsic updates
+#### Debug record updates
 
-**Moving** a debug intrinsic: avoid moving `llvm.dbg.assign` intrinsics where
+**Moving** a debug record: avoid moving `#dbg_assign` records where
 possible, as they represent a source-level assignment, whose position in the
 program should not be affected by optimization passes.
 
-**Deleting** a debug intrinsic: Nothing new to do. Just like for conventional
-debug intrinsics, unless it is unreachable, it’s almost always incorrect to
-delete a `llvm.dbg.assign` intrinsic.
+**Deleting** a debug record: Nothing new to do. Just like for conventional
+debug records, unless it is unreachable, it’s almost always incorrect to
+delete a `#dbg_assign` record.
 
-### Lowering `llvm.dbg.assign` to MIR
+### Lowering `#dbg_assign` to MIR
 
-To begin with only SelectionDAG ISel will be supported. `llvm.dbg.assign`
-intrinsics are lowered to MIR `DBG_INSTR_REF` instructions. Before this happens
+To begin with only SelectionDAG ISel will be supported. `#dbg_assign`
+records are lowered to MIR `DBG_INSTR_REF` instructions. Before this happens
 we need to decide where it is appropriate to use memory locations and where we
 must use a non-memory location (or no location) for each variable. In order to
 make those decisions we run a standard fixed-point dataflow analysis that makes
@@ -214,9 +207,9 @@ to tackle:
   clang/test/CodeGen/assignment-tracking/assignment-tracking.cpp for examples.
 
 * `trackAssignments` doesn't yet work for variables that have their
-  `llvm.dbg.declare` location modified by a `DIExpression`, e.g. when the
+  `#dbg_declare` location modified by a `DIExpression`, e.g. when the
   address of the variable is itself stored in an `alloca` with the
-  `llvm.dbg.declare` using `DIExpression(DW_OP_deref)`. See `indirectReturn` in
+  `#dbg_declare` using `DIExpression(DW_OP_deref)`. See `indirectReturn` in
   llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll and in
   clang/test/CodeGen/assignment-tracking/assignment-tracking.cpp for an
   example.
@@ -225,13 +218,13 @@ to tackle:
   memory location is available without using a `DIAssignID`. This is because
   the storage address is not computed by an instruction (it's an argument
   value) and therefore we have nowhere to put the metadata attachment. To solve
-  this we probably need another marker intrinsic to denote "the variable's
-  stack home is X address" - similar to `llvm.dbg.declare` except that it needs
-  to compose with `llvm.dbg.assign` intrinsics such that the stack home address
-  is only selected as a location for the variable when the `llvm.dbg.assign`
-  intrinsics agree it should be.
+  this we probably need another marker record to denote "the variable's
+  stack home is X address" - similar to `#dbg_declare` except that it needs
+  to compose with `#dbg_assign` records such that the stack home address
+  is only selected as a location for the variable when the `#dbg_assign`
+  records agree it should be.
 
-* Given the above (a special "the stack home is X" intrinsic), and the fact
+* Given the above (a special "the stack home is X" record), and the fact
   that we can only track assignments with fixed offsets and sizes, I think we
   can probably get rid of the address and address-expression part, since it
   will always be computable with the info we have.
diff --git a/llvm/docs/BranchWeightMetadata.rst b/llvm/docs/BranchWeightMetadata.rst
index 62204753e29b06..522f37cdad4fc1 100644
--- a/llvm/docs/BranchWeightMetadata.rst
+++ b/llvm/docs/BranchWeightMetadata.rst
@@ -28,14 +28,11 @@ Supported Instructions
 
 Metadata is only assigned to the conditional branches. There are two extra
 operands for the true and the false branch.
-We optionally track if the metadata was added by ``__builtin_expect`` or
-``__builtin_expect_with_probability`` with an optional field ``!"expected"``.
 
 .. code-block:: none
 
   !0 = !{
     !"branch_weights",
-    [ !"expected", ]
     i32 <TRUE_BRANCH_WEIGHT>,
     i32 <FALSE_BRANCH_WEIGHT>
   }
@@ -50,7 +47,6 @@ is always case #0).
 
   !0 = !{
     !"branch_weights",
-    [ !"expected", ]
     i32 <DEFAULT_BRANCH_WEIGHT>
     [ , i32 <CASE_BRANCH_WEIGHT> ... ]
   }
@@ -64,7 +60,6 @@ Branch weights are assigned to every destination.
 
   !0 = !{
     !"branch_weights",
-    [ !"expected", ]
     i32 <LABEL_BRANCH_WEIGHT>
     [ , i32 <LABEL_BRANCH_WEIGHT> ... ]
   }
@@ -80,7 +75,6 @@ block and entry counts which may not be accurate with sampling.
 
   !0 = !{
     !"branch_weights",
-    [ !"expected", ]
     i32 <CALL_BRANCH_WEIGHT>
   }
 
@@ -101,7 +95,6 @@ is used.
 
   !0 = !{
     !"branch_weights",
-    [ !"expected", ]
     i32 <INVOKE_NORMAL_WEIGHT>
     [ , i32 <INVOKE_UNWIND_WEIGHT> ]
   }
diff --git a/llvm/docs/HowToUpdateDebugInfo.rst b/llvm/docs/HowToUpdateDebugInfo.rst
index c64b5d1d0d98b6..db3465aa54d844 100644
--- a/llvm/docs/HowToUpdateDebugInfo.rst
+++ b/llvm/docs/HowToUpdateDebugInfo.rst
@@ -151,7 +151,7 @@ Deleting an IR-level Instruction
 
 When an ``Instruction`` is deleted, its debug uses change to ``undef``. This is
 a loss of debug info: the value of one or more source variables becomes
-unavailable, starting with the ``llvm.dbg.value(undef, ...)``. When there is no
+unavailable, starting with the ``#dbg_value(undef, ...)``. When there is no
 way to reconstitute the value of the lost instruction, this is the best
 possible outcome. However, it's often possible to do better:
 
@@ -172,7 +172,7 @@ possible outcome. However, it's often possible to do better:
   define i16 @foo(i16 %a) {
     %b = sext i16 %a to i32
     %c = and i32 %b, 15
-    call void @llvm.dbg.value(metadata i32 %c, ...)
+      #dbg_value(i32 %c, ...)
     %d = trunc i32 %c to i16
     ret i16 %d
   }
@@ -183,7 +183,7 @@ replaced with a simplified instruction:
 .. code-block:: llvm
 
   define i16 @foo(i16 %a) {
-    call void @llvm.dbg.value(metadata i32 undef, ...)
+      #dbg_value(i32 undef, ...)
     %simplified = and i16 %a, 15
     ret i16 %simplified
   }
@@ -204,7 +204,7 @@ This results in better debug info because the debug use of ``%c`` is preserved:
 
   define i16 @foo(i16 %a) {
     %simplified = and i16 %a, 15
-    call void @llvm.dbg.value(metadata i16 %simplified, ...)
+      #dbg_value(i16 %simplified, ...)
     ret i16 %simplified
   }
 
@@ -249,7 +249,7 @@ module, and the second checks that this DI is still available after an
 optimization has occurred, reporting any errors/warnings while doing so.
 
 The instructions are assigned sequentially increasing line locations, and are
-immediately used by debug value intrinsics everywhere possible.
+immediately used by debug value records everywhere possible.
 
 For example, here is a module before:
 
@@ -271,10 +271,10 @@ and after running ``opt -debugify``:
    define void @f(i32* %x) !dbg !6 {
    entry:
      %x.addr = alloca i32*, align 8, !dbg !12
-     call void @llvm.dbg.value(metadata i32** %x.addr, metadata !9, metadata !DIExpression()), !dbg !12
+       #dbg_value(i32** %x.addr, !9, !DIExpression(), !12)
      store i32* %x, i32** %x.addr, align 8, !dbg !13
      %0 = load i32*, i32** %x.addr, align 8, !dbg !14
-     call void @llvm.dbg.value(metadata i32* %0, metadata !11, metadata !DIExpression()), !dbg !14
+       #dbg_value(i32* %0, !11, !DIExpression(), !14)
      store i32 10, i32* %0, align 4, !dbg !15
      ret void, !dbg !16
    }
@@ -409,7 +409,7 @@ as follows:
   $ clang -Xclang -fverify-debuginfo-preserve -Xclang -fverify-debuginfo-preserve-export=sample.json -g -O2 sample.c
 
 Please do note that there are some known false positives, for source locations
-and debug intrinsic checking, so that will be addressed as a future work.
+and debug record checking, so that will be addressed as a future work.
 
 Mutation testing for MIR-level transformations
 ----------------------------------------------
diff --git a/llvm/docs/InstrRefDebugInfo.md b/llvm/docs/InstrRefDebugInfo.md
index 3917989e4026df..eb7a0464b90a0c 100644
--- a/llvm/docs/InstrRefDebugInfo.md
+++ b/llvm/docs/InstrRefDebugInfo.md
@@ -24,7 +24,7 @@ referring to instruction values:
 
 ```llvm
 %2 = add i32 %0, %1
-call void @llvm.dbg.value(metadata i32 %2,
+  #dbg_value(metadata i32 %2,
 ```
 
 In LLVM IR, the IR Value is synonymous with the instruction that computes the
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 3cd719fe297a5e..dd7290132a32c9 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -6289,14 +6289,12 @@ DIExpression
 """"""""""""
 
 ``DIExpression`` nodes represent expressions that are inspired by the DWARF
-expression language. ``DIExpression`` nodes must not be ``distinct``, and are
-canonically printed inline at each use. They are used in :ref:`debug
-intrinsics<dbg_intrinsics>` (such as ``llvm.dbg.declare`` and
-``llvm.dbg.value``) to describe how the referenced LLVM variable relates to the
-source language variable. Debug intrinsics are interpreted left-to-right: start
-by pushing the value/address operand of the intrinsic onto a stack, then
-repeatedly push and evaluate opcodes from the DIExpression until the final
-variable description is produced.
+expression language. They are used in :ref:`debug records <debugrecords>`
+(such as ``#dbg_declare`` and ``#dbg_value``) to describe how the
+referenced LLVM variable relates to the source language variable. Debug
+expressions are interpreted left-to-right: start by pushing the value/address
+operand of the record onto a stack, then repeatedly push and evaluate
+opcodes from the DIExpression until the final variable description is produced.
 
 The current supported opcode vocabulary is limited:
 
@@ -6392,23 +6390,24 @@ The current supported opcode vocabulary is limited:
 
     IR for "*ptr = 4;"
     --------------
-    call void @llvm.dbg.value(metadata i32 4, metadata !17,
-                              metadata !DIExpression(DW_OP_LLVM_implicit_pointer)))
+      #dbg_value(i32 4, !17, !DIExpression(DW_OP_LLVM_implicit_pointer), !20)
     !17 = !DILocalVariable(name: "ptr1", scope: !12, file: !3, line: 5,
                            type: !18)
     !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
     !19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+    !20 = !DILocation(line: 10, scope: !12)
 
     IR for "**ptr = 4;"
     --------------
-    call void @llvm.dbg.value(metadata i32 4, metadata !17,
-                              metadata !DIExpression(DW_OP_LLVM_implicit_pointer,
-                                                     DW_OP_LLVM_implicit_pointer)))
+      #dbg_value(i32 4, !17,
+        !DIExpression(DW_OP_LLVM_implicit_pointer, DW_OP_LLVM_implicit_pointer),
+        !21)
     !17 = !DILocalVariable(name: "ptr1", scope: !12, file: !3, line: 5,
                            type: !18)
     !18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
     !19 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !20, size: 64)
     !20 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+    !21 = !DILocation(line: 10, scope: !12)
 
 DWARF specifies three kinds of simple location descriptions: Register, memory,
 and implicit location descriptions.  Note that a location description is
@@ -6419,45 +6418,48 @@ sense that a debugger might modify its value), whereas *implicit locations*
 describe merely the actual *value* of a source variable which might not exist
 in registers or in memory (see ``DW_OP_stack_value``).
 
-A ``llvm.dbg.declare`` intrinsic describes an indirect value (the address) of a
-source variable. The first operand of the intrinsic must be an address of some
-kind. A DIExpression attached to the intrinsic refines this address to produce a
+A ``#dbg_declare`` record describes an indirect value (the address) of a
+source variable. The first operand of the record must be an address of some
+kind. A DIExpression operand to the record refines this address to produce a
 concrete location for the source variable.
 
-A ``llvm.dbg.value`` intrinsic describes the direct value of a source variable.
-The first operand of the intrinsic may be a direct or indirect value. A
-DIExpression attached to the intrinsic refines the first operand to produce a
+A ``#dbg_value`` record describes the direct value of a source variable.
+The first operand of the record may be a direct or indirect value. A
+DIExpression operand to the record refines the first operand to produce a
 direct value. For example, if the first operand is an indirect value, it may be
 necessary to insert ``DW_OP_deref`` into the DIExpression in order to produce a
-valid debug intrinsic.
+valid debug record.
 
 .. note::
 
    A DIExpression is interpreted in the same way regardless of which kind of
-   debug intrinsic it's attached to.
+   debug record it's attached to.
+
+   DIExpressions are always printed and parsed inline; they can never be
+   referenced by an ID (e.g. ``!1``).
 
 .. code-block:: text
 
-    !0 = !DIExpression(DW_OP_deref)
-    !1 = !DIExpression(DW_OP_plus_uconst, 3)
-    !1 = !DIExpression(DW_OP_constu, 3, DW_OP_plus)
-    !2 = !DIExpression(DW_OP_bit_piece, 3, 7)
-    !3 = !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7)
-    !4 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
-    !5 = !DIExpression(DW_OP_constu, 42, DW_OP_stack_value)
+    !DIExpression(DW_OP_deref)
+    !DIExpression(DW_OP_plus_uconst, 3)
+    !DIExpression(DW_OP_constu, 3, DW_OP_plus)
+    !DIExpression(DW_OP_bit_piece, 3, 7)
+    !DIExpression(DW_OP_deref, DW_OP_constu, 3, DW_OP_plus, DW_OP_LLVM_fragment, 3, 7)
+    !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
+    !DIExpression(DW_OP_constu, 42, DW_OP_stack_value)
 
 DIAssignID
 """"""""""
 
 ``DIAssignID`` nodes have no operands and are always distinct. They are used to
-link together `@llvm.dbg.assign` intrinsics (:ref:`debug
-intrinsics<dbg_intrinsics>`) and instructions that store in IR. See `Debug Info
-Assignment Tracking <AssignmentTracking.html>`_ for more info.
+link together (:ref:`#dbg_assign records <debugrecords>`) and instructions
+that store in IR. See `Debug Info Assignment Tracking
+<AssignmentTracking.html>`_ for more info.
 
 .. code-block:: llvm
 
     store i32 %a, ptr %a.addr, align 4, !DIAssignID !2
-    llvm.dbg.assign(metadata %a, metadata !1, metadata !DIExpression(), !2, metadata %a.addr, metadata !DIExpression()), !dbg !3
+    #dbg_assign(%a, !1, !DIExpression(), !2, %a.addr, !DIExpression(), !3)
 
     !2 = distinct !DIAssignID()
 
@@ -6470,19 +6472,19 @@ DIArgList
    "node", and what that actually means. The names in the implementation could
    also be updated to mirror whatever we decide here.
 
-``DIArgList`` nodes hold a list of constant or SSA value references.
-``DIArgList`` must not be ``distinct``, must only be used as an argument to a
-function call, and must appear inline at each use. ``DIArgList`` may refer to
-function-local values of the containing function. ``DIArgList`` nodes are used
-in :ref:`debug intrinsics<dbg_intrinsics>` (currently only in
-``llvm.dbg.value``) in combination with a ``DIExpression`` that uses the
-``DW_OP_LLVM_arg`` operator.
+``DIArgList`` nodes hold a list of constant or SSA value references. These are
+used in :ref:`debug records <debugrecords>` in combination with a
+``DIExpression`` that uses the
+``DW_OP_LLVM_arg`` operator. Because a DIArgList may refer to local values
+within a function, it must only be used as a function argument, must always be
+inlined, and cannot appear in named metadata.
 
 .. code-block:: text
 
-    llvm.dbg.value(metadata !DIArgList(i32 %a, i32 %b),
-                   metadata !16,
-                   metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus))
+    #dbg_value(!DIArgList(i32 %a, i32 %b),
+               !16,
+               !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus),
+               !26)
 
 DIFlags
 """""""
@@ -12961,12 +12963,12 @@ an extra level of indentation. As an example:
     #dbg_value(%inst1, !10, !DIExpression(), !11)
   %inst2 = op2 %inst1, %c
 
-These debug records are an optional replacement for
-:ref:`debug intrinsics<dbg_intrinsics>`. Debug records will be output if the
-``--write-experimental-debuginfo`` flag is passed to LLVM; it is an error for both
-records and intrinsics to appear in the same module. More information about
-debug records can be found in the `LLVM Source Level Debugging
-<SourceLevelDebugging.html#format-common-intrinsics>`_ document.
+These debug records replace the prior :ref:`debug intrinsics<dbg_intrinsics>`.
+Debug records will be disabled if ``--write-experimental-debuginfo=false`` is
+passed to LLVM; it is an error for both records and intrinsics to appear in the
+same module. More information about debug records can be found in the `LLVM
+Source Level Debugging <SourceLevelDebugging.html#format-common-intrinsics>`_
+document.
 
 .. _intrinsics:
 
@@ -26248,6 +26250,42 @@ same values as the libm ``cos`` functions would, and handles error
 conditions in the same way.
 
 
+'``llvm.experimental.constrained.tan``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare <type>
+      @llvm.experimental.constrained.tan(<type> <op1>,
+                                         metadata <rounding mode>,
+                                         metadata <exception behavior>)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.constrained.tan``' intrinsic returns the tangent of the
+first operand.
+
+Arguments:
+""""""""""
+
+The first argument and the return type are floating-point numbers of the same
+type.
+
+The second and third arguments specify the rounding mode and exception
+behavior as described above.
+
+Semantics:
+""""""""""
+
+This function returns the tangent of the specified operand, returning the
+same values as the libm ``tan`` functions would, and handles error
+conditions in the same way.
+
+
 '``llvm.experimental.constrained.exp``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/docs/MIRLangRef.rst b/llvm/docs/MIRLangRef.rst
index ec29870128c1d1..b4b59dbfa8ca4e 100644
--- a/llvm/docs/MIRLangRef.rst
+++ b/llvm/docs/MIRLangRef.rst
@@ -883,16 +883,16 @@ Where:
 
 - ``debug-info-location`` identifies a DILocation metadata node.
 
-These metadata attributes correspond to the operands of a ``llvm.dbg.declare``
-IR intrinsic, see the :ref:`source level debugging<format_common_intrinsics>`
-documentation.
+These metadata attributes correspond to the operands of a ``#dbg_declare``
+IR debug record, see the :ref:`source level
+debugging<debug_records>` documentation.
 
 Varying variable locations
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 Variables that are not always on the stack or change location are specified
 with the ``DBG_VALUE``  meta machine instruction. It is synonymous with the
-``llvm.dbg.value`` IR intrinsic, and is written:
+``#dbg_value`` IR record, and is written:
 
 .. code-block:: text
 
diff --git a/llvm/docs/Passes.rst b/llvm/docs/Passes.rst
index 2edad5cd3c881a..49f633e98d16fe 100644
--- a/llvm/docs/Passes.rst
+++ b/llvm/docs/Passes.rst
@@ -935,8 +935,9 @@ declarations and removes them.  Dead declarations are declarations of functions
 for which no implementation is available (i.e., declarations for unused library
 functions).
 
-``strip-debug-declare``: Strip all ``llvm.dbg.declare`` intrinsics
-------------------------------------------------------------------
+``strip-debug-declare``: Strip all ``llvm.dbg.declare`` intrinsics and
+``#dbg_declare`` records.
+-------------------------------------------------------------------
 
 Performs code stripping. Similar to strip, but only strips
 ``llvm.dbg.declare`` intrinsics.
diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index 35115e67ecf924..ef06f80c747f94 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -85,6 +85,7 @@ on support follow.
      Extension         Status
      ================  =================================================================
      ``A``             Supported
+     ``B``             Supported
      ``C``             Supported
      ``D``             Supported
      ``F``             Supported
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index b711be3012eb83..7b0cbe5ecfcc5b 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -61,6 +61,10 @@ Changes to the LLVM IR
 
   * ``icmp``
   * ``fcmp``
+* LLVM has switched from using debug intrinsics in textual IR to using debug
+  records by default. Details of the change and instructions on how to update
+  any downstream tools and tests can be found in the `migration docs
+  <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
 
 Changes to LLVM infrastructure
 ------------------------------
@@ -147,6 +151,7 @@ Changes to the RISC-V Backend
 * Zaamo and Zalrsc are no longer experimental.
 * Processors that enable post reg-alloc scheduling (PostMachineScheduler) by default should use the `UsePostRAScheduler` subtarget feature. Setting `PostRAScheduler = 1` in the scheduler model will have no effect on the enabling of the PostMachineScheduler.
 * Zabha is no longer experimental.
+* B (the collection of the Zba, Zbb, Zbs extensions) is supported.
 
 Changes to the WebAssembly Backend
 ----------------------------------
@@ -220,13 +225,6 @@ Changes to the Metadata Info
 Changes to the Debug Info
 ---------------------------------
 
-* LLVM has switched from using debug intrinsics internally to using debug
-  records by default. This should happen transparently when using the DIBuilder
-  to construct debug variable information, but will require changes for any code
-  that interacts with debug intrinsics directly. Debug intrinsics will only be
-  supported on a best-effort basis from here onwards; for more information, see
-  the `migration docs <https://llvm.org/docs/RemoveDIsDebugInfo.html>`_.
-
 Changes to the LLVM tools
 ---------------------------------
 * llvm-nm and llvm-objdump can now print symbol information from linked
diff --git a/llvm/docs/SourceLevelDebugging.rst b/llvm/docs/SourceLevelDebugging.rst
index 0e6ef5d9872286..82be253ffc23cc 100644
--- a/llvm/docs/SourceLevelDebugging.rst
+++ b/llvm/docs/SourceLevelDebugging.rst
@@ -43,7 +43,7 @@ important ones are:
   debuggers, like GDB or DBX.
 
 The approach used by the LLVM implementation is to use a small set of
-:ref:`intrinsic functions <format_common_intrinsics>` to define a mapping
+:ref:`debug records <debug_records>` to define a mapping
 between LLVM program objects and the source-level objects.  The description of
 the source-level program is maintained in LLVM metadata in an
 :ref:`implementation-defined format <ccxx_frontend>` (the C/C++ front-end
@@ -169,83 +169,99 @@ Debug information descriptors are `specialized metadata nodes
 
 There are two models for defining the values of source variables at different
 states of the program and tracking these values through optimization and code
-generation: :ref:`intrinsic function calls <format_common_intrinsics>`, the
-current default, and :ref:`debug records <debug_records>`, which are a new
-non-instruction-based model
-(for an explanation of how this works and why it is desirable, see the
-`RemoveDIs <RemoveDIsDebugInfo.html>`_ document). Each module must use one or
-the other; they may never be mixed within an IR module. To enable writing debug
-records instead of intrinsic calls, use the flag
-``--write-experimental-debuginfo``.
+generation: :ref:`debug records <debug_records>`, the current default, and
+:ref:`intrinsic function calls <format_common_intrinsics>`, which are
+non-default but currently supported for backwards compatibility - though these
+two models must never be mixed within an IR module. For an explanation of why
+we changed to the new model, how it works, and guidance on how to update old
+code or IR to use debug records, see the `RemoveDIs <RemoveDIsDebugInfo.html>`_
+document.
 
-.. _format_common_intrinsics:
+.. _debug_records:
 
-Debugger intrinsic functions
+Debug Records
 ----------------------------
 
-LLVM uses several intrinsic functions (name prefixed with "``llvm.dbg``") to
-track source local variables through optimization and code generation.
+Debug records define the value that a source variable has during execution of
+the program; they appear interleaved with instructions, although they are not
+instructions themselves and have no effect on the code generated by the
+compiler.
 
-``llvm.dbg.declare``
-^^^^^^^^^^^^^^^^^^^^
+LLVM uses several types of debug records to define source variables. The
+common syntax for these records is:
 
 .. code-block:: llvm
 
-  void @llvm.dbg.declare(metadata, metadata, metadata)
+    #dbg_<kind>([<arg>, ]* <DILocation>)
+  ; Using the intrinsic model, the above is equivalent to:
+  call void llvm.dbg.<kind>([metadata <arg>, ]*), !dbg <DILocation>
+
+Debug records are always printed with an extra level of indentation compared
+to instructions, and always have the prefix `#dbg_` and a list of
+comma-separated arguments in parentheses, as with a `call`.
+
+``#dbg_declare``
+^^^^^^^^^^^^^^^^
+
+.. code-block:: llvm
+
+    #dbg_declare([Value|MDNode], DILocalVariable, DIExpression, DILocation)
 
-This intrinsic provides information about a local element (e.g., variable).
-The first argument is metadata holding the address of variable, typically a
-static alloca in the function entry block.  The second argument is a
+This record provides information about a local element (e.g., variable).
+The first argument is an SSA value corresponding to a variable address, and is
+typically a static alloca in the function entry block.  The second argument is a
 `local variable <LangRef.html#dilocalvariable>`_ containing a description of
 the variable.  The third argument is a `complex expression
-<LangRef.html#diexpression>`_.  An `llvm.dbg.declare` intrinsic describes the
+<LangRef.html#diexpression>`_. The fourth argument is a `source location
+<LangRef.html#dilocation>`_. A ``#dbg_declare`` record describes the
 *address* of a source variable.
 
-.. code-block:: text
+.. code-block:: llvm
 
     %i.addr = alloca i32, align 4
-    call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !1,
-                                metadata !DIExpression()), !dbg !2
+      #dbg_declare(ptr %i.addr, !1, !DIExpression(), !2)
+    ; ...
     !1 = !DILocalVariable(name: "i", ...) ; int i
     !2 = !DILocation(...)
-    ...
+    ; ...
     %buffer = alloca [256 x i8], align 8
     ; The address of i is buffer+64.
-    call void @llvm.dbg.declare(metadata [256 x i8]* %buffer, metadata !3,
-                               metadata !DIExpression(DW_OP_plus, 64)), !dbg !4
+      #dbg_declare(ptr %buffer, !3, !DIExpression(DW_OP_plus, 64), !4)
+    ; ...
     !3 = !DILocalVariable(name: "i", ...) ; int i
     !4 = !DILocation(...)
 
-A frontend should generate exactly one call to ``llvm.dbg.declare`` at the point
+A frontend should generate exactly one ``#dbg_declare`` record at the point
 of declaration of a source variable. Optimization passes that fully promote the
-variable from memory to SSA values will replace this call with possibly multiple
-calls to `llvm.dbg.value`. Passes that delete stores are effectively partial
-promotion, and they will insert a mix of calls to ``llvm.dbg.value`` to track
-the source variable value when it is available. After optimization, there may be
-multiple calls to ``llvm.dbg.declare`` describing the program points where the
-variables lives in memory. All calls for the same concrete source variable must
-agree on the memory location.
+variable from memory to SSA values will replace this record with possibly
+multiple ``#dbg_value``` records. Passes that delete stores are effectively
+partial promotion, and they will insert a mix of ``#dbg_value`` records to
+track the source variable value when it is available. After optimization, there
+may be multiple ``#dbg_declare`` records describing the program points where
+the variables lives in memory. All calls for the same concrete source variable
+must agree on the memory location.
 
 
-``llvm.dbg.value``
-^^^^^^^^^^^^^^^^^^
+``#dbg_value``
+^^^^^^^^^^^^^^
 
 .. code-block:: llvm
 
-  void @llvm.dbg.value(metadata, metadata, metadata)
+    #dbg_value([Value|DIArgList|MDNode], DILocalVariable, DIExpression, DILocation)
 
-This intrinsic provides information when a user source variable is set to a new
-value.  The first argument is the new value (wrapped as metadata).  The second
-argument is a `local variable <LangRef.html#dilocalvariable>`_ containing a
-description of the variable.  The third argument is a `complex expression
-<LangRef.html#diexpression>`_.
+This record provides information when a user source variable is set to a new
+value.  The first argument is the new value. The second argument is a `local
+variable <LangRef.html#dilocalvariable>`_ containing a description of the
+variable.  The third argument is a `complex expression
+<LangRef.html#diexpression>`_. The fourth argument is a `source location
+<LangRef.html#dilocation>`_.
 
-An `llvm.dbg.value` intrinsic describes the *value* of a source variable
+A ``#dbg_value`` record describes the *value* of a source variable
 directly, not its address.  Note that the value operand of this intrinsic may
 be indirect (i.e, a pointer to the source variable), provided that interpreting
 the complex expression derives the direct value.
 
-``llvm.dbg.assign``
+``#dbg_assign``
 ^^^^^^^^^^^^^^^^^^^
 .. toctree::
    :hidden:
@@ -254,85 +270,87 @@ the complex expression derives the direct value.
 
 .. code-block:: llvm
 
-  void @llvm.dbg.assign(Value *Value,
-                        DIExpression *ValueExpression,
-                        DILocalVariable *Variable,
-                        DIAssignID *ID,
-                        Value *Address,
-                        DIExpression *AddressExpression)
+  #dbg_assign( [Value|DIArgList|MDNode] Value,
+               DILocalVariable Variable,
+               DIExpression ValueExpression,
+               DIAssignID ID,
+               [Value|MDNode] Address,
+               DIExpression AddressExpression,
+               DILocation SourceLocation )
 
-This intrinsic marks the position in IR where a source assignment occurred. It
+This record marks the position in IR where a source assignment occurred. It
 encodes the value of the variable. It references the store, if any, that
 performs the assignment, and the destination address.
 
-The first three arguments are the same as for an ``llvm.dbg.value``. The fourth
+The first three arguments are the same as for a ``#dbg_value``. The fourth
 argument is a ``DIAssignID`` used to reference a store. The fifth is the
-destination of the store (wrapped as metadata), and the sixth is a `complex
-expression <LangRef.html#diexpression>`_ that modifies it.
+destination of the store, the sixth is a `complex
+expression <LangRef.html#diexpression>`_ that modfies it, and the seventh is a
+`source location <LangRef.html#dilocation>`_.
 
-The formal LLVM-IR signature is:
+See :doc:`AssignmentTracking` for more info.
 
-.. code-block:: llvm
+Debugger intrinsic functions
+----------------------------
 
-  void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+.. _format_common_intrinsics:
 
+In intrinsic-mode, LLVM uses several intrinsic functions (name prefixed with "``llvm.dbg``") to
+track source local variables through optimization and code generation. These
+intrinsic functions each correspond to one of the debug records above, with a
+few syntactic differences: each argument to a debugger intrinsic must be wrapped
+as metadata, meaning it must be prefixed with ``metadata``, and the
+``DILocation`` argument in each record must be a metadata attachment to the
+call instruction, meaning it appears after the argument list with the prefix
+``!dbg``.
 
-See :doc:`AssignmentTracking` for more info.
+``llvm.dbg.declare``
+^^^^^^^^^^^^^^^^^^^^
 
-.. _debug_records:
+.. code-block:: llvm
 
-Debug Records
-----------------------------
+  void @llvm.dbg.declare(metadata, metadata, metadata)
 
-LLVM also has an alternative to intrinsic functions, debug records, which
-function similarly but are not instructions. The basic syntax for debug records
-is:
+This intrinsic is equivalent to ``#dbg_declare``:
 
 .. code-block:: llvm
 
-    #dbg_<kind>([<arg>, ]* <DILocation>)
-  ; Using the intrinsic model, the above is equivalent to:
-  call void llvm.dbg.<kind>([metadata <arg>, ]*), !dbg <DILocation>
+      #dbg_declare(i32* %i.addr, !1, !DIExpression(), !2)
+    call void @llvm.dbg.declare(metadata i32* %i.addr, metadata !1,
+                                metadata !DIExpression()), !dbg !2
 
-A debug intrinsic function can be converted to a debug record with the
-following steps:
+``llvm.dbg.value``
+^^^^^^^^^^^^^^^^^^
 
-1. Add an extra level of indentation.
-2. Replace everything prior to the intrinsic kind (declare/value/assign) with
-   ``#dbg_``.
-3. Remove the leading ``metadata`` from the intrinsic's arguments.
-4. Transfer the ``!dbg`` attachment to be an argument, dropping the leading
-   ``!dbg``.
+.. code-block:: llvm
 
-For each kind of intrinsic function, there is an equivalent debug record.
+  void @llvm.dbg.value(metadata, metadata, metadata)
 
-``#dbg_declare``
-^^^^^^^^^^^^^^^^
+This intrinsic is equivalent to ``#dbg_value``:
 
 .. code-block:: llvm
 
-    #dbg_declare([Value|MDNode], DILocalVariable, DIExpression, DILocation)
-
-Equivalent to the ``llvm.dbg.declare`` intrinsic.
+      #dbg_value(i32 %i, !1, !DIExpression(), !2)
+    call void @llvm.dbg.value(metadata i32 %i, metadata !1,
+                              metadata !DIExpression()), !dbg !2
 
-``#dbg_value``
-^^^^^^^^^^^^^^
+``llvm.dbg.assign``
+^^^^^^^^^^^^^^^^^^^
 
 .. code-block:: llvm
 
-    #dbg_value([Value|DIArgList|MDNode], DILocalVariable, DIExpression, DILocation)
-
-Equivalent to the ``llvm.dbg.value`` intrinsic.
+  void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
 
-``#dbg_assign``
-^^^^^^^^^^^^^^^
+This intrinsic is equivalent to ``#dbg_assign``:
 
 .. code-block:: llvm
 
-    #dbg_assign([Value|DIArgList|MDNode], DILocalVariable, DIExpression,
-                DIAssignID, [Value|MDNode], DIExpression, DILocation)
+      #dbg_assign(i32 %i, !1, !DIExpression(), !2, 
+                  ptr %i.addr, !DIExpression(), !3)
+    call void @llvm.dbg.assign(
+      metadata i32 %i, metadata !1, metadata !DIExpression(), metadata !2,
+      metadata ptr %i.addr, metadata !DIExpression(), metadata !3), !dbg !3
 
-Equivalent to the ``llvm.dbg.assign`` intrinsic.
 
 Object lifetimes and scoping
 ============================
@@ -371,22 +389,19 @@ Compiled to LLVM, this function would be represented like this:
     %X = alloca i32, align 4
     %Y = alloca i32, align 4
     %Z = alloca i32, align 4
-    call void @llvm.dbg.declare(metadata i32* %X, metadata !11, metadata !DIExpression()), !dbg !13
-    store i32 21, i32* %X, align 4, !dbg !13
-    call void @llvm.dbg.declare(metadata i32* %Y, metadata !14, metadata !DIExpression()), !dbg !15
-    store i32 22, i32* %Y, align 4, !dbg !15
-    call void @llvm.dbg.declare(metadata i32* %Z, metadata !16, metadata !DIExpression()), !dbg !18
-    store i32 23, i32* %Z, align 4, !dbg !18
-    %0 = load i32, i32* %X, align 4, !dbg !19
-    store i32 %0, i32* %Z, align 4, !dbg !20
-    %1 = load i32, i32* %Y, align 4, !dbg !21
-    store i32 %1, i32* %X, align 4, !dbg !22
-    ret void, !dbg !23
+      #dbg_declare(ptr %X, !11, !DIExpression(), !14)
+    store i32 21, i32* %X, align 4, !dbg !14
+      #dbg_declare(ptr %Y, !15, !DIExpression(), !16)
+    store i32 22, i32* %Y, align 4, !dbg !16
+      #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
+    store i32 23, i32* %Z, align 4, !dbg !19
+    %0 = load i32, i32* %X, align 4, !dbg !20
+    store i32 %0, i32* %Z, align 4, !dbg !21
+    %1 = load i32, i32* %Y, align 4, !dbg !22
+    store i32 %1, i32* %X, align 4, !dbg !23
+    ret void, !dbg !24
   }
 
-  ; Function Attrs: nounwind readnone
-  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
-
   attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
   attributes #1 = { nounwind readnone }
 
@@ -410,29 +425,29 @@ Compiled to LLVM, this function would be represented like this:
   !13 = !DILocation(line: 2, column: 9, scope: !4)
   !14 = !DILocalVariable(name: "Y", scope: !4, file: !1, line: 3, type: !12)
   !15 = !DILocation(line: 3, column: 9, scope: !4)
-  !16 = !DILocalVariable(name: "Z", scope: !17, file: !1, line: 5, type: !12)
+  !16 = !DILocalVariable(name: "Z", scope: !18, file: !1, line: 5, type: !12)
   !17 = distinct !DILexicalBlock(scope: !4, file: !1, line: 4, column: 5)
-  !18 = !DILocation(line: 5, column: 11, scope: !17)
-  !19 = !DILocation(line: 6, column: 11, scope: !17)
-  !20 = !DILocation(line: 6, column: 9, scope: !17)
+  !18 = !DILocation(line: 5, column: 11, scope: !18)
+  !29 = !DILocation(line: 6, column: 11, scope: !18)
+  !20 = !DILocation(line: 6, column: 9, scope: !18)
   !21 = !DILocation(line: 8, column: 9, scope: !4)
   !22 = !DILocation(line: 8, column: 7, scope: !4)
   !23 = !DILocation(line: 9, column: 3, scope: !4)
 
 
 This example illustrates a few important details about LLVM debugging
-information.  In particular, it shows how the ``llvm.dbg.declare`` intrinsic and
+information.  In particular, it shows how the ``#dbg_declare`` record and
 location information, which are attached to an instruction, are applied
 together to allow a debugger to analyze the relationship between statements,
 variable definitions, and the code used to implement the function.
 
 .. code-block:: llvm
 
-  call void @llvm.dbg.declare(metadata i32* %X, metadata !11, metadata !DIExpression()), !dbg !13
+    #dbg_declare(ptr %X, !11, !DIExpression(), !14)
     ; [debug line = 2:7] [debug variable = X]
 
-The first intrinsic ``%llvm.dbg.declare`` encodes debugging information for the
-variable ``X``.  The metadata ``!dbg !13`` attached to the intrinsic provides
+The first record ``#dbg_declare`` encodes debugging information for the
+variable ``X``.  The location ``!14`` at the end of the record provides
 scope information for the variable ``X``.
 
 .. code-block:: text
@@ -445,18 +460,26 @@ scope information for the variable ``X``.
 Here ``!13`` is metadata providing `location information
 <LangRef.html#dilocation>`_.  In this example, scope is encoded by ``!4``, a
 `subprogram descriptor <LangRef.html#disubprogram>`_.  This way the location
-information attached to the intrinsics indicates that the variable ``X`` is
+information parameter to the records indicates that the variable ``X`` is
 declared at line number 2 at a function level scope in function ``foo``.
 
 Now lets take another example.
 
 .. code-block:: llvm
 
+<<<<<<< HEAD
   call void @llvm.dbg.declare(metadata i32* %Z, metadata !16, metadata !DIExpression()), !dbg !18
     ; [debug line = 5:9] [debug variable = Z]
 
 The third intrinsic ``%llvm.dbg.declare`` encodes debugging information for
 variable ``Z``.  The metadata ``!dbg !18`` attached to the intrinsic provides
+=======
+    #dbg_declare(ptr %Z, !17, !DIExpression(), !19)
+    ; [debug line = 5:9] [debug variable = Z]
+
+The third record ``#dbg_declare`` encodes debugging information for
+variable ``Z``.  The metadata ``!19`` at the end of the record provides
+>>>>>>> 38ccee00346300c87abc34860398bc950c65eaec
 scope information for the variable ``Z``.
 
 .. code-block:: text
@@ -478,18 +501,18 @@ In the example above, every variable assignment uniquely corresponds to a
 memory store to the variable's position on the stack. However in heavily
 optimized code LLVM promotes most variables into SSA values, which can
 eventually be placed in physical registers or memory locations. To track SSA
-values through compilation, when objects are promoted to SSA values an
-``llvm.dbg.value`` intrinsic is created for each assignment, recording the
-variable's new location. Compared with the ``llvm.dbg.declare`` intrinsic:
+values through compilation, when objects are promoted to SSA values a
+``#dbg_value`` record is created for each assignment, recording the
+variable's new location. Compared with the ``#dbg_declare`` record:
 
-* A dbg.value terminates the effect of any preceding dbg.values for (any
+* A #dbg_value terminates the effect of any preceding #dbg_values for (any
   overlapping fragments of) the specified variable.
-* The dbg.value's position in the IR defines where in the instruction stream
+* The #dbg_value's position in the IR defines where in the instruction stream
   the variable's value changes.
 * Operands can be constants, indicating the variable is assigned a
   constant value.
 
-Care must be taken to update ``llvm.dbg.value`` intrinsics when optimization
+Care must be taken to update ``#dbg_value`` records when optimization
 passes alter or move instructions and blocks -- the developer could observe such
 changes reflected in the value of variables when debugging the program. For any
 execution of the optimized program, the set of variable values presented to the
@@ -500,7 +523,7 @@ damaging their understanding of the optimized program and undermining their
 trust in the debugger.
 
 Sometimes perfectly preserving variable locations is not possible, often when a
-redundant calculation is optimized out. In such cases, a ``llvm.dbg.value``
+redundant calculation is optimized out. In such cases, a ``#dbg_value``
 with operand ``poison`` should be used, to terminate earlier variable locations
 and let the debugger present ``optimized out`` to the developer. Withholding
 these potentially stale variable values from the developer diminishes the
@@ -513,26 +536,26 @@ To illustrate some potential issues, consider the following example:
 
   define i32 @foo(i32 %bar, i1 %cond) {
   entry:
-    call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2)
+      #dbg_value(i32 0, !1, !DIExpression(), !4)
     br i1 %cond, label %truebr, label %falsebr
   truebr:
     %tval = add i32 %bar, 1
-    call @llvm.dbg.value(metadata i32 %tval, metadata !1, metadata !2)
+      #dbg_value(i32 %tval, !1, !DIExpression(), !4)
     %g1 = call i32 @gazonk()
     br label %exit
   falsebr:
     %fval = add i32 %bar, 2
-    call @llvm.dbg.value(metadata i32 %fval, metadata !1, metadata !2)
+      #dbg_value(i32 %fval, !1, !DIExpression(), !4)
     %g2 = call i32 @gazonk()
     br label %exit
   exit:
     %merge = phi [ %tval, %truebr ], [ %fval, %falsebr ]
     %g = phi [ %g1, %truebr ], [ %g2, %falsebr ]
-    call @llvm.dbg.value(metadata i32 %merge, metadata !1, metadata !2)
-    call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2)
+      #dbg_value(i32 %merge, !1, !DIExpression(), !4)
+      #dbg_value(i32 %g, !3, !DIExpression(), !4)
     %plusten = add i32 %merge, 10
     %toret = add i32 %plusten, %g
-    call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2)
+      #dbg_value(i32 %toret, !1, !DIExpression(), !4)
     ret i32 %toret
   }
 
@@ -550,48 +573,48 @@ perhaps, be optimized into the following code:
     ret i32 %toret
   }
 
-What ``llvm.dbg.value`` intrinsics should be placed to represent the original variable
+What ``#dbg_value`` records should be placed to represent the original variable
 locations in this code? Unfortunately the second, third and fourth
-dbg.values for ``!1`` in the source function have had their operands
+#dbg_values for ``!1`` in the source function have had their operands
 (%tval, %fval, %merge) optimized out. Assuming we cannot recover them, we
-might consider this placement of dbg.values:
+might consider this placement of #dbg_values:
 
 .. code-block:: llvm
 
   define i32 @foo(i32 %bar, i1 %cond) {
   entry:
-    call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2)
+      #dbg_value(i32 0, !1, !DIExpression(), !4)
     %g = call i32 @gazonk()
-    call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2)
+      #dbg_value(i32 %g, !3, !DIExpression(), !4)
     %addoper = select i1 %cond, i32 11, i32 12
     %plusten = add i32 %bar, %addoper
     %toret = add i32 %plusten, %g
-    call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2)
+      #dbg_value(i32 %toret, !1, !DIExpression(), !4)
     ret i32 %toret
   }
 
 However, this will cause ``!3`` to have the return value of ``@gazonk()`` at
 the same time as ``!1`` has the constant value zero -- a pair of assignments
 that never occurred in the unoptimized program. To avoid this, we must terminate
-the range that ``!1`` has the constant value assignment by inserting a poison
-dbg.value before the dbg.value for ``!3``:
+the range that ``!1`` has the constant value assignment by inserting an poison
+#dbg_value before the #dbg_value for ``!3``:
 
 .. code-block:: llvm
 
   define i32 @foo(i32 %bar, i1 %cond) {
   entry:
-    call @llvm.dbg.value(metadata i32 0, metadata !1, metadata !2)
+      #dbg_value(i32 0, !1, !DIExpression(), !2)
     %g = call i32 @gazonk()
-    call @llvm.dbg.value(metadata i32 poison, metadata !1, metadata !2)
-    call @llvm.dbg.value(metadata i32 %g, metadata !3, metadata !2)
+      #dbg_value(i32 poison, !1, !DIExpression(), !2)
+      #dbg_value(i32 %g, !3, !DIExpression(), !2)
     %addoper = select i1 %cond, i32 11, i32 12
     %plusten = add i32 %bar, %addoper
     %toret = add i32 %plusten, %g
-    call @llvm.dbg.value(metadata i32 %toret, metadata !1, metadata !2)
+      #dbg_value(i32 %toret, !1, !DIExpression(), !2)
     ret i32 %toret
   }
 
-There are a few other dbg.value configurations that mean it terminates
+There are a few other #dbg_value configurations that mean it terminates
 dominating location definitions without adding a new location. The complete
 list is:
 
@@ -601,16 +624,16 @@ list is:
 * There are no location operands (empty ``DIArgList``) and the ``DIExpression``
   is empty.
 
-This class of dbg.value that kills variable locations is called a "kill
-dbg.value" or "kill location", and for legacy reasons the term "undef
-dbg.value" may be used in existing code. The ``DbgVariableIntrinsic`` methods
+This class of #dbg_value that kills variable locations is called a "kill
+#dbg_value" or "kill location", and for legacy reasons the term "undef
+#dbg_value" may be used in existing code. The ``DbgVariableIntrinsic`` methods
 ``isKillLocation`` and ``setKillLocation`` should be used where possible rather
-than inspecting location operands directly to check or set whether a dbg.value
+than inspecting location operands directly to check or set whether a #dbg_value
 is a kill location.
 
-In general, if any dbg.value has its operand optimized out and cannot be
-recovered, then a kill dbg.value is necessary to terminate earlier variable
-locations. Additional kill dbg.values may be necessary when the debugger can
+In general, if any #dbg_value has its operand optimized out and cannot be
+recovered, then a kill #dbg_value is necessary to terminate earlier variable
+locations. Additional kill #dbg_values may be necessary when the debugger can
 observe re-ordering of assignments.
 
 How variable location metadata is transformed during CodeGen
@@ -621,9 +644,9 @@ ultimately producing a mapping between source-level information and
 instruction ranges. This
 is relatively straightforwards for line number information, as mapping
 instructions to line numbers is a simple association. For variable locations
-however the story is more complex. As each ``llvm.dbg.value`` intrinsic
+however the story is more complex. As each ``#dbg_value`` record
 represents a source-level assignment of a value to a source variable, the
-variable location intrinsics effectively embed a small imperative program
+debug records effectively embed a small imperative program
 within the LLVM IR. By the end of CodeGen, this becomes a mapping from each
 variable to their machine locations over ranges of instructions.
 From IR to object emission, the major transformations which affect variable
@@ -638,7 +661,7 @@ significantly change the ordering of the program, and occurs in a number of
 different passes.
 
 Some variable locations are not transformed during CodeGen. Stack locations
-specified by ``llvm.dbg.declare`` are valid and unchanging for the entire
+specified by ``#dbg_declare`` are valid and unchanging for the entire
 duration of the function, and are recorded in a simple MachineFunction table.
 Location changes in the prologue and epilogue of a function are also ignored:
 frame setup and destruction may take several instructions, require a
@@ -667,11 +690,11 @@ otherwise transformed into a non-register, the variable location becomes
 unavailable.
 
 Locations that are unavailable are treated as if they have been optimized out:
-in IR the location would be assigned ``undef`` by a debug intrinsic, and in MIR
+in IR the location would be assigned ``undef`` by a debug record, and in MIR
 the equivalent location is used.
 
 After MIR locations are assigned to each variable, machine pseudo-instructions
-corresponding to each ``llvm.dbg.value`` intrinsic are inserted. There are two
+corresponding to each ``#dbg_value`` record are inserted. There are two
 forms of this type of instruction.
 
 The first form, ``DBG_VALUE``, appears thus:
@@ -683,14 +706,14 @@ The first form, ``DBG_VALUE``, appears thus:
 And has the following operands:
  * The first operand can record the variable location as a register,
    a frame index, an immediate, or the base address register if the original
-   debug intrinsic referred to memory. ``$noreg`` indicates the variable
-   location is undefined, equivalent to an ``undef`` dbg.value operand.
+   debug record referred to memory. ``$noreg`` indicates the variable
+   location is undefined, equivalent to an ``undef`` #dbg_value operand.
  * The type of the second operand indicates whether the variable location is
    directly referred to by the DBG_VALUE, or whether it is indirect. The
    ``$noreg`` register signifies the former, an immediate operand (0) the
    latter.
- * Operand 3 is the Variable field of the original debug intrinsic.
- * Operand 4 is the Expression field of the original debug intrinsic.
+ * Operand 3 is the Variable field of the original debug record.
+ * Operand 4 is the Expression field of the original debug record.
 
 The second form, ``DBG_VALUE_LIST``, appears thus:
 
@@ -699,8 +722,8 @@ The second form, ``DBG_VALUE_LIST``, appears thus:
   DBG_VALUE_LIST !123, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_arg, 1, DW_OP_plus), %1, %2
 
 And has the following operands:
- * The first operand is the Variable field of the original debug intrinsic.
- * The second operand is the Expression field of the original debug intrinsic.
+ * The first operand is the Variable field of the original debug record.
+ * The second operand is the Expression field of the original debug record.
  * Any number of operands, from the 3rd onwards, record a sequence of variable
    location operands, which may take any of the same values as the first
    operand of the ``DBG_VALUE`` instruction above. These variable location
@@ -709,7 +732,7 @@ And has the following operands:
    <LangRef.html#diexpression>`_.
 
 The position at which the DBG_VALUEs are inserted should correspond to the
-positions of their matching ``llvm.dbg.value`` intrinsics in the IR block.  As
+positions of their matching ``#dbg_value`` records in the IR block.  As
 with optimization, LLVM aims to preserve the order in which variable
 assignments occurred in the source program. However SelectionDAG performs some
 instruction scheduling, which can reorder assignments (discussed below).
@@ -723,20 +746,20 @@ the following example:
 
   define i32 @foo(i32* %addr) {
   entry:
-    call void @llvm.dbg.value(metadata i32 0, metadata !3, metadata !DIExpression()), !dbg !5
+      #dbg_value(i32 0, !3, !DIExpression(), !5)
     br label %bb1, !dbg !5
 
   bb1:                                              ; preds = %bb1, %entry
     %bar.0 = phi i32 [ 0, %entry ], [ %add, %bb1 ]
-    call void @llvm.dbg.value(metadata i32 %bar.0, metadata !3, metadata !DIExpression()), !dbg !5
+      #dbg_value(i32 %bar.0, !3, !DIExpression(), !5)
     %addr1 = getelementptr i32, i32 *%addr, i32 1, !dbg !5
-    call void @llvm.dbg.value(metadata i32 *%addr1, metadata !3, metadata !DIExpression()), !dbg !5
+      #dbg_value(i32 *%addr1, !3, !DIExpression(), !5)
     %loaded1 = load i32, i32* %addr1, !dbg !5
     %addr2 = getelementptr i32, i32 *%addr, i32 %bar.0, !dbg !5
-    call void @llvm.dbg.value(metadata i32 *%addr2, metadata !3, metadata !DIExpression()), !dbg !5
+      #dbg_value(i32 *%addr2, !3, !DIExpression(), !5)
     %loaded2 = load i32, i32* %addr2, !dbg !5
     %add = add i32 %bar.0, 1, !dbg !5
-    call void @llvm.dbg.value(metadata i32 %add, metadata !3, metadata !DIExpression()), !dbg !5
+      #dbg_value(i32 %add, !3, !DIExpression(), !5)
     %added = add i32 %loaded1, %loaded2
     %cond = icmp ult i32 %added, %bar.0, !dbg !5
     br i1 %cond, label %bb1, label %bb2, !dbg !5
@@ -778,12 +801,12 @@ If one compiles this IR with ``llc -o - -start-after=codegen-prepare -stop-after
     $eax = COPY %8, debug-location !5
     RET 0, $eax, debug-location !5
 
-Observe first that there is a DBG_VALUE instruction for every ``llvm.dbg.value``
-intrinsic in the source IR, ensuring no source level assignments go missing.
+Observe first that there is a DBG_VALUE instruction for every ``#dbg_value``
+record in the source IR, ensuring no source level assignments go missing.
 Then consider the different ways in which variable locations have been recorded:
 
-* For the first dbg.value an immediate operand is used to record a zero value.
-* The dbg.value of the PHI instruction leads to a DBG_VALUE of virtual register
+* For the first #dbg_value an immediate operand is used to record a zero value.
+* The #dbg_value of the PHI instruction leads to a DBG_VALUE of virtual register
   ``%0``.
 * The first GEP has its effect folded into the first load instruction
   (as a 4-byte offset), but the variable location is salvaged by folding
@@ -791,7 +814,7 @@ Then consider the different ways in which variable locations have been recorded:
 * The second GEP is also folded into the corresponding load. However, it is
   insufficiently simple to be salvaged, and is emitted as a ``$noreg``
   DBG_VALUE, indicating that the variable takes on an undefined location.
-* The final dbg.value has its Value placed in virtual register ``%1``.
+* The final #dbg_value has its Value placed in virtual register ``%1``.
 
 Instruction Scheduling
 ----------------------
@@ -898,14 +921,19 @@ presents several difficulties:
     br label %exit, !dbg !26
 
   truebr:
+<<<<<<< HEAD
     call void @llvm.dbg.value(metadata i32 %input, metadata !30, metadata !DIExpression()), !dbg !23
     call void @llvm.dbg.value(metadata i32 1, metadata !22, metadata !DIExpression()), !dbg !23
+=======
+      #dbg_value(i32 %input, !30, !DIExpression(), !24)
+      #dbg_value(i32 1, !23, !DIExpression(), !24)
+>>>>>>> 38ccee00346300c87abc34860398bc950c65eaec
     %value1 = add i32 %input, 1
     br label %bb1
 
   falsebr:
-    call void @llvm.dbg.value(metadata i32 %input, metadata !30, metadata !DIExpression()), !dbg !24
-    call void @llvm.dbg.value(metadata i32 2, metadata !23, metadata !DIExpression()), !dbg !24
+      #dbg_value(i32 %input, !30, !DIExpression(), !24)
+      #dbg_value(i32 2, !23, !DIExpression(), !24)
     %value2 = add i32 %input, 2
     br label %bb1
 
@@ -919,13 +947,13 @@ Here the difficulties are:
 * The value of the ``!22`` variable merges into ``%bb1``, but there is no PHI
   node
 
-As mentioned above, the ``llvm.dbg.value`` intrinsics essentially form an
-imperative program embedded in the IR, with each intrinsic defining a variable
+As mentioned above, the ``#dbg_value`` records essentially form an
+imperative program embedded in the IR, with each record defining a variable
 location. This *could* be converted to an SSA form by mem2reg, in the same way
 that it uses use-def chains to identify control flow merges and insert phi
 nodes for IR Values. However, because debug variable locations are defined for
 every machine instruction, in effect every IR instruction uses every variable
-location, which would lead to a large number of debugging intrinsics being
+location, which would lead to a large number of debugging records being
 generated.
 
 Examining the example above, variable ``!30`` is assigned ``%input`` on both
@@ -934,8 +962,8 @@ constant values on either path. Where control flow merges in ``%bb1`` we would
 want ``!30`` to keep its location (``%input``), but ``!22`` to become undefined
 as we cannot determine at runtime what value it should have in %bb1 without
 inserting a PHI node. mem2reg does not insert the PHI node to avoid changing
-codegen when debugging is enabled, and does not insert the other dbg.values
-to avoid adding very large numbers of intrinsics.
+codegen when debugging is enabled, and does not insert the other #dbg_values
+to avoid adding very large numbers of records.
 
 Instead, LiveDebugValues determines variable locations when control
 flow merges. A dataflow analysis is used to propagate locations between blocks:
diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst
index e24feb3bf5fa24..6604e240419053 100644
--- a/llvm/docs/TestingGuide.rst
+++ b/llvm/docs/TestingGuide.rst
@@ -741,16 +741,16 @@ RUN lines:
 
    Expands to the path separator, i.e. ``:`` (or ``;`` on Windows).
 
-``${fs-src-root}``
+``%{fs-src-root}``
    Expands to the root component of file system paths for the source directory,
    i.e. ``/`` on Unix systems or ``C:\`` (or another drive) on Windows.
 
-``${fs-tmp-root}``
+``%{fs-tmp-root}``
    Expands to the root component of file system paths for the test's temporary
    directory, i.e. ``/`` on Unix systems or ``C:\`` (or another drive) on
    Windows.
 
-``${fs-sep}``
+``%{fs-sep}``
    Expands to the file system separator, i.e. ``/`` or ``\`` on Windows.
 
 ``%/s, %/S, %/t, %/T``
diff --git a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst
index d9f11dd6d77900..c75d8814918aaa 100644
--- a/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst
+++ b/llvm/docs/tutorial/MyFirstLanguageFrontend/LangImpl09.rst
@@ -416,7 +416,7 @@ argument allocas in ``FunctionAST::codegen``.
 
 Here we're first creating the variable, giving it the scope (``SP``),
 the name, source location, type, and since it's an argument, the argument
-index. Next, we create an ``lvm.dbg.declare`` call to indicate at the IR
+index. Next, we create a ``#dbg_declare`` record to indicate at the IR
 level that we've got a variable in an alloca (and it gives a starting
 location for the variable), and setting a source location for the
 beginning of the scope on the declare.
diff --git a/llvm/include/llvm/ADT/APFloat.h b/llvm/include/llvm/ADT/APFloat.h
index 78faadb30d9eb5..a9bb6cc9999b1e 100644
--- a/llvm/include/llvm/ADT/APFloat.h
+++ b/llvm/include/llvm/ADT/APFloat.h
@@ -189,6 +189,14 @@ struct APFloatBase {
     // improved range compared to half (16-bit) formats, at (potentially)
     // greater throughput than single precision (32-bit) formats.
     S_FloatTF32,
+    // 6-bit floating point number with bit layout S1E3M2. Unlike IEEE-754
+    // types, there are no infinity or NaN values. The format is detailed in
+    // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    S_Float6E3M2FN,
+    // 6-bit floating point number with bit layout S1E2M3. Unlike IEEE-754
+    // types, there are no infinity or NaN values. The format is detailed in
+    // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    S_Float6E2M3FN,
 
     S_x87DoubleExtended,
     S_MaxSemantics = S_x87DoubleExtended,
@@ -209,6 +217,8 @@ struct APFloatBase {
   static const fltSemantics &Float8E4M3FNUZ() LLVM_READNONE;
   static const fltSemantics &Float8E4M3B11FNUZ() LLVM_READNONE;
   static const fltSemantics &FloatTF32() LLVM_READNONE;
+  static const fltSemantics &Float6E3M2FN() LLVM_READNONE;
+  static const fltSemantics &Float6E2M3FN() LLVM_READNONE;
   static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
@@ -627,6 +637,8 @@ class IEEEFloat final : public APFloatBase {
   APInt convertFloat8E4M3FNUZAPFloatToAPInt() const;
   APInt convertFloat8E4M3B11FNUZAPFloatToAPInt() const;
   APInt convertFloatTF32APFloatToAPInt() const;
+  APInt convertFloat6E3M2FNAPFloatToAPInt() const;
+  APInt convertFloat6E2M3FNAPFloatToAPInt() const;
   void initFromAPInt(const fltSemantics *Sem, const APInt &api);
   template <const fltSemantics &S> void initFromIEEEAPInt(const APInt &api);
   void initFromHalfAPInt(const APInt &api);
@@ -642,6 +654,8 @@ class IEEEFloat final : public APFloatBase {
   void initFromFloat8E4M3FNUZAPInt(const APInt &api);
   void initFromFloat8E4M3B11FNUZAPInt(const APInt &api);
   void initFromFloatTF32APInt(const APInt &api);
+  void initFromFloat6E3M2FNAPInt(const APInt &api);
+  void initFromFloat6E2M3FNAPInt(const APInt &api);
 
   void assign(const IEEEFloat &);
   void copySignificand(const IEEEFloat &);
@@ -1046,6 +1060,17 @@ class APFloat : public APFloatBase {
   /// \param Semantics - type float semantics
   static APFloat getAllOnesValue(const fltSemantics &Semantics);
 
+  static bool hasNanOrInf(const fltSemantics &Sem) {
+    switch (SemanticsToEnum(Sem)) {
+    default:
+      return true;
+    // Below Semantics do not support {NaN or Inf}
+    case APFloat::S_Float6E3M2FN:
+    case APFloat::S_Float6E2M3FN:
+      return false;
+    }
+  }
+
   /// Used to insert APFloat objects, or objects that contain APFloat objects,
   /// into FoldingSets.
   void Profile(FoldingSetNodeID &NID) const;
diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h
index 4f660825bd6e08..b085f086598178 100644
--- a/llvm/include/llvm/AsmParser/LLParser.h
+++ b/llvm/include/llvm/AsmParser/LLParser.h
@@ -337,6 +337,7 @@ namespace llvm {
 
     // Top-Level Entities
     bool parseTopLevelEntities();
+    bool finalizeDebugInfoFormat(Module *M);
     void dropUnknownMetadataReferences();
     bool validateEndOfModule(bool UpgradeDebugInfo);
     bool validateEndOfIndex();
diff --git a/llvm/include/llvm/Bitcode/LLVMBitCodes.h b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
index b71a3b9d7aaed3..9b268c2cc53ea7 100644
--- a/llvm/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/llvm/include/llvm/Bitcode/LLVMBitCodes.h
@@ -758,6 +758,7 @@ enum AttributeKindCodes {
   ATTR_KIND_DEAD_ON_UNWIND = 91,
   ATTR_KIND_SANITIZED_PADDED_GLOBAL = 92,
   ATTR_KIND_RANGE = 93,
+  ATTR_KIND_SANITIZE_NUMERICAL_STABILITY = 94,
 };
 
 enum ComdatSelectionKindCodes {
diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h
index a3a81625183675..bb409fc04aed5d 100644
--- a/llvm/include/llvm/CodeGen/AsmPrinter.h
+++ b/llvm/include/llvm/CodeGen/AsmPrinter.h
@@ -38,6 +38,7 @@ class BasicBlock;
 class BlockAddress;
 class Constant;
 class ConstantArray;
+class ConstantPtrAuth;
 class DataLayout;
 class DIE;
 class DIEAbbrev;
@@ -585,6 +586,10 @@ class AsmPrinter : public MachineFunctionPass {
     emitGlobalConstant(DL, CV);
   }
 
+  virtual const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) {
+    report_fatal_error("ptrauth constant lowering not implemented");
+  }
+
   /// Return true if the basic block has exactly one predecessor and the control
   /// transfer mechanism between the predecessor and this block is a
   /// fall-through.
diff --git a/llvm/include/llvm/CodeGen/MachineDominators.h b/llvm/include/llvm/CodeGen/MachineDominators.h
index 30c18ef410fab3..7c7df8ca4c3810 100644
--- a/llvm/include/llvm/CodeGen/MachineDominators.h
+++ b/llvm/include/llvm/CodeGen/MachineDominators.h
@@ -24,6 +24,7 @@
 #include "llvm/Support/GenericDomTreeConstruction.h"
 #include <cassert>
 #include <memory>
+#include <optional>
 
 namespace llvm {
 class AnalysisUsage;
@@ -39,16 +40,39 @@ inline void DominatorTreeBase<MachineBasicBlock, false>::addRoot(
 
 extern template class DomTreeNodeBase<MachineBasicBlock>;
 extern template class DominatorTreeBase<MachineBasicBlock, false>; // DomTree
-extern template class DominatorTreeBase<MachineBasicBlock, true>; // PostDomTree
 
-using MachineDomTree = DomTreeBase<MachineBasicBlock>;
 using MachineDomTreeNode = DomTreeNodeBase<MachineBasicBlock>;
 
+namespace DomTreeBuilder {
+using MBBDomTree = DomTreeBase<MachineBasicBlock>;
+using MBBUpdates = ArrayRef<llvm::cfg::Update<MachineBasicBlock *>>;
+using MBBDomTreeGraphDiff = GraphDiff<MachineBasicBlock *, false>;
+
+extern template void Calculate<MBBDomTree>(MBBDomTree &DT);
+extern template void CalculateWithUpdates<MBBDomTree>(MBBDomTree &DT,
+                                                      MBBUpdates U);
+
+extern template void InsertEdge<MBBDomTree>(MBBDomTree &DT,
+                                            MachineBasicBlock *From,
+                                            MachineBasicBlock *To);
+
+extern template void DeleteEdge<MBBDomTree>(MBBDomTree &DT,
+                                            MachineBasicBlock *From,
+                                            MachineBasicBlock *To);
+
+extern template void ApplyUpdates<MBBDomTree>(MBBDomTree &DT,
+                                              MBBDomTreeGraphDiff &,
+                                              MBBDomTreeGraphDiff *);
+
+extern template bool Verify<MBBDomTree>(const MBBDomTree &DT,
+                                        MBBDomTree::VerificationLevel VL);
+} // namespace DomTreeBuilder
+
 //===-------------------------------------
 /// DominatorTree Class - Concrete subclass of DominatorTreeBase that is used to
 /// compute a normal dominator tree.
 ///
-class MachineDominatorTree : public MachineFunctionPass {
+class MachineDominatorTree : public DomTreeBase<MachineBasicBlock> {
   /// Helper structure used to hold all the basic blocks
   /// involved in the split of a critical edge.
   struct CriticalEdge {
@@ -70,62 +94,55 @@ class MachineDominatorTree : public MachineFunctionPass {
   /// such as BB == elt.NewBB.
   mutable SmallSet<MachineBasicBlock *, 32> NewBBs;
 
-  /// The DominatorTreeBase that is used to compute a normal dominator tree.
-  std::unique_ptr<MachineDomTree> DT;
-
   /// Apply all the recorded critical edges to the DT.
   /// This updates the underlying DT information in a way that uses
   /// the fast query path of DT as much as possible.
+  /// FIXME: This method should not be a const member!
   ///
   /// \post CriticalEdgesToSplit.empty().
   void applySplitCriticalEdges() const;
 
 public:
-  static char ID; // Pass ID, replacement for typeid
+  using Base = DomTreeBase<MachineBasicBlock>;
 
-  MachineDominatorTree();
-  explicit MachineDominatorTree(MachineFunction &MF) : MachineFunctionPass(ID) {
-    calculate(MF);
-  }
+  MachineDominatorTree() = default;
+  explicit MachineDominatorTree(MachineFunction &MF) { calculate(MF); }
 
-  MachineDomTree &getBase() {
-    if (!DT)
-      DT.reset(new MachineDomTree());
+  // FIXME: If there is an updater for MachineDominatorTree,
+  // migrate to this updater and remove these wrappers.
+
+  MachineDominatorTree &getBase() {
     applySplitCriticalEdges();
-    return *DT;
+    return *this;
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const override;
-
   MachineBasicBlock *getRoot() const {
     applySplitCriticalEdges();
-    return DT->getRoot();
+    return Base::getRoot();
   }
 
   MachineDomTreeNode *getRootNode() const {
     applySplitCriticalEdges();
-    return DT->getRootNode();
+    return const_cast<MachineDomTreeNode *>(Base::getRootNode());
   }
 
-  bool runOnMachineFunction(MachineFunction &F) override;
-
   void calculate(MachineFunction &F);
 
   bool dominates(const MachineDomTreeNode *A,
                  const MachineDomTreeNode *B) const {
     applySplitCriticalEdges();
-    return DT->dominates(A, B);
+    return Base::dominates(A, B);
   }
 
   void getDescendants(MachineBasicBlock *A,
                       SmallVectorImpl<MachineBasicBlock *> &Result) {
     applySplitCriticalEdges();
-    DT->getDescendants(A, Result);
+    Base::getDescendants(A, Result);
   }
 
   bool dominates(const MachineBasicBlock *A, const MachineBasicBlock *B) const {
     applySplitCriticalEdges();
-    return DT->dominates(A, B);
+    return Base::dominates(A, B);
   }
 
   // dominates - Return true if A dominates B. This performs the
@@ -133,7 +150,8 @@ class MachineDominatorTree : public MachineFunctionPass {
   bool dominates(const MachineInstr *A, const MachineInstr *B) const {
     applySplitCriticalEdges();
     const MachineBasicBlock *BBA = A->getParent(), *BBB = B->getParent();
-    if (BBA != BBB) return DT->dominates(BBA, BBB);
+    if (BBA != BBB)
+      return Base::dominates(BBA, BBB);
 
     // Loop through the basic block until we find A or B.
     MachineBasicBlock::const_iterator I = BBA->begin();
@@ -146,13 +164,13 @@ class MachineDominatorTree : public MachineFunctionPass {
   bool properlyDominates(const MachineDomTreeNode *A,
                          const MachineDomTreeNode *B) const {
     applySplitCriticalEdges();
-    return DT->properlyDominates(A, B);
+    return Base::properlyDominates(A, B);
   }
 
   bool properlyDominates(const MachineBasicBlock *A,
                          const MachineBasicBlock *B) const {
     applySplitCriticalEdges();
-    return DT->properlyDominates(A, B);
+    return Base::properlyDominates(A, B);
   }
 
   /// findNearestCommonDominator - Find nearest common dominator basic block
@@ -160,12 +178,12 @@ class MachineDominatorTree : public MachineFunctionPass {
   MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A,
                                                 MachineBasicBlock *B) {
     applySplitCriticalEdges();
-    return DT->findNearestCommonDominator(A, B);
+    return Base::findNearestCommonDominator(A, B);
   }
 
   MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
     applySplitCriticalEdges();
-    return DT->getNode(BB);
+    return Base::getNode(BB);
   }
 
   /// getNode - return the (Post)DominatorTree node for the specified basic
@@ -173,7 +191,7 @@ class MachineDominatorTree : public MachineFunctionPass {
   ///
   MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
     applySplitCriticalEdges();
-    return DT->getNode(BB);
+    return Base::getNode(BB);
   }
 
   /// addNewBlock - Add a new node to the dominator tree information.  This
@@ -182,7 +200,7 @@ class MachineDominatorTree : public MachineFunctionPass {
   MachineDomTreeNode *addNewBlock(MachineBasicBlock *BB,
                                   MachineBasicBlock *DomBB) {
     applySplitCriticalEdges();
-    return DT->addNewBlock(BB, DomBB);
+    return Base::addNewBlock(BB, DomBB);
   }
 
   /// changeImmediateDominator - This method is used to update the dominator
@@ -191,13 +209,13 @@ class MachineDominatorTree : public MachineFunctionPass {
   void changeImmediateDominator(MachineBasicBlock *N,
                                 MachineBasicBlock *NewIDom) {
     applySplitCriticalEdges();
-    DT->changeImmediateDominator(N, NewIDom);
+    Base::changeImmediateDominator(N, NewIDom);
   }
 
   void changeImmediateDominator(MachineDomTreeNode *N,
                                 MachineDomTreeNode *NewIDom) {
     applySplitCriticalEdges();
-    DT->changeImmediateDominator(N, NewIDom);
+    Base::changeImmediateDominator(N, NewIDom);
   }
 
   /// eraseNode - Removes a node from  the dominator tree. Block must not
@@ -205,29 +223,23 @@ class MachineDominatorTree : public MachineFunctionPass {
   /// children list. Deletes dominator node associated with basic block BB.
   void eraseNode(MachineBasicBlock *BB) {
     applySplitCriticalEdges();
-    DT->eraseNode(BB);
+    Base::eraseNode(BB);
   }
 
   /// splitBlock - BB is split and now it has one successor. Update dominator
   /// tree to reflect this change.
   void splitBlock(MachineBasicBlock* NewBB) {
     applySplitCriticalEdges();
-    DT->splitBlock(NewBB);
+    Base::splitBlock(NewBB);
   }
 
   /// isReachableFromEntry - Return true if A is dominated by the entry
   /// block of the function containing it.
   bool isReachableFromEntry(const MachineBasicBlock *A) {
     applySplitCriticalEdges();
-    return DT->isReachableFromEntry(A);
+    return Base::isReachableFromEntry(A);
   }
 
-  void releaseMemory() override;
-
-  void verifyAnalysis() const override;
-
-  void print(raw_ostream &OS, const Module*) const override;
-
   /// Record that the critical edge (FromBB, ToBB) has been
   /// split with NewBB.
   /// This is best to use this method instead of directly update the
@@ -251,6 +263,34 @@ class MachineDominatorTree : public MachineFunctionPass {
   }
 };
 
+/// \brief Analysis pass which computes a \c MachineDominatorTree.
+class MachineDominatorTreeWrapperPass : public MachineFunctionPass {
+  // MachineFunctionPass may verify the analysis result without running pass,
+  // e.g. when `F.hasAvailableExternallyLinkage` is true.
+  std::optional<MachineDominatorTree> DT;
+
+public:
+  static char ID;
+
+  MachineDominatorTreeWrapperPass();
+
+  MachineDominatorTree &getDomTree() { return *DT; }
+  const MachineDominatorTree &getDomTree() const { return *DT; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  void verifyAnalysis() const override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void releaseMemory() override;
+
+  void print(raw_ostream &OS, const Module *M = nullptr) const override;
+};
+
 //===-------------------------------------
 /// DominatorTree GraphTraits specialization so the DominatorTree can be
 /// iterable by generic graph iterators.
diff --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
index 1039ac4e5189b3..a9b5eaf41c3f8d 100644
--- a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
@@ -30,7 +30,7 @@ using MachineUniformityInfo = GenericUniformityInfo<MachineSSAContext>;
 /// everything is uniform.
 MachineUniformityInfo computeMachineUniformityInfo(
     MachineFunction &F, const MachineCycleInfo &cycleInfo,
-    const MachineDomTree &domTree, bool HasBranchDivergence);
+    const MachineDominatorTree &domTree, bool HasBranchDivergence);
 
 /// Legacy analysis pass which computes a \ref MachineUniformityInfo.
 class MachineUniformityAnalysisPass : public MachineFunctionPass {
diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 2dcd7805b6c96b..dbb658940eef12 100644
--- a/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -61,7 +61,7 @@ class DWARFFormValue {
   const DWARFUnit *U = nullptr; /// Remember the DWARFUnit at extract time.
   const DWARFContext *C = nullptr; /// Context for extract time.
 
-  DWARFFormValue(dwarf::Form F, ValueType V) : Form(F), Value(V) {}
+  DWARFFormValue(dwarf::Form F, const ValueType &V) : Form(F), Value(V) {}
 
 public:
   DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F) {}
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
index 331d34d6f2673f..9509c537ffca95 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
+++ b/llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h
@@ -198,7 +198,7 @@ struct TargetRegionEntryInfo {
                                          unsigned DeviceID, unsigned FileID,
                                          unsigned Line, unsigned Count);
 
-  bool operator<(const TargetRegionEntryInfo RHS) const {
+  bool operator<(const TargetRegionEntryInfo &RHS) const {
     return std::make_tuple(ParentName, DeviceID, FileID, Line, Count) <
            std::make_tuple(RHS.ParentName, RHS.DeviceID, RHS.FileID, RHS.Line,
                            RHS.Count);
diff --git a/llvm/include/llvm/IR/Attributes.td b/llvm/include/llvm/IR/Attributes.td
index b462ced8194e2c..799a678145ca25 100644
--- a/llvm/include/llvm/IR/Attributes.td
+++ b/llvm/include/llvm/IR/Attributes.td
@@ -288,6 +288,9 @@ def SanitizeMemTag : EnumAttr<"sanitize_memtag", [FnAttr]>;
 /// Attribute to identify global variables instrumented by Sanitizers.
 def SanitizedPaddedGlobal : EnumAttr<"sanitized_padded_global", []>;
 
+/// NumericalStabilitySanitizer is on.
+def SanitizeNumericalStability : EnumAttr<"sanitize_numerical_stability", [FnAttr]>;
+
 /// Speculative Load Hardening is enabled.
 ///
 /// Note that this uses the default compatibility (always compatible during
@@ -375,6 +378,7 @@ def : CompatRule<"isEqual<SanitizeThreadAttr>">;
 def : CompatRule<"isEqual<SanitizeMemoryAttr>">;
 def : CompatRule<"isEqual<SanitizeHWAddressAttr>">;
 def : CompatRule<"isEqual<SanitizeMemTagAttr>">;
+def : CompatRule<"isEqual<SanitizeNumericalStabilityAttr>">;
 def : CompatRule<"isEqual<SafeStackAttr>">;
 def : CompatRule<"isEqual<ShadowCallStackAttr>">;
 def : CompatRule<"isEqual<UseSampleProfileAttr>">;
diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def
index 41aa44de957f93..a7b37c5cb204da 100644
--- a/llvm/include/llvm/IR/ConstrainedOps.def
+++ b/llvm/include/llvm/IR/ConstrainedOps.def
@@ -95,6 +95,7 @@ DAG_FUNCTION(round,           1, 0, experimental_constrained_round,      FROUND)
 DAG_FUNCTION(roundeven,       1, 0, experimental_constrained_roundeven,  FROUNDEVEN)
 DAG_FUNCTION(sin,             1, 1, experimental_constrained_sin,        FSIN)
 DAG_FUNCTION(sqrt,            1, 1, experimental_constrained_sqrt,       FSQRT)
+DAG_FUNCTION(tan,             1, 1, experimental_constrained_tan,        FTAN)
 DAG_FUNCTION(trunc,           1, 0, experimental_constrained_trunc,      FTRUNC)
 
 // This is definition for fmuladd intrinsic function, that is converted into
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index bfdb3d8a034b69..e62696b34c72fe 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1218,6 +1218,10 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in
                                                     [ LLVMMatchType<0>,
                                                       llvm_metadata_ty,
                                                       llvm_metadata_ty ]>;
+  def int_experimental_constrained_tan  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
+                                                    [ LLVMMatchType<0>,
+                                                      llvm_metadata_ty,
+                                                      llvm_metadata_ty ]>;
   def int_experimental_constrained_pow  : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ],
                                                     [ LLVMMatchType<0>,
                                                       LLVMMatchType<0>,
diff --git a/llvm/include/llvm/IR/MDBuilder.h b/llvm/include/llvm/IR/MDBuilder.h
index e02ec8f5a3d8bb..3265589b7c8dfa 100644
--- a/llvm/include/llvm/IR/MDBuilder.h
+++ b/llvm/include/llvm/IR/MDBuilder.h
@@ -59,11 +59,7 @@ class MDBuilder {
   //===------------------------------------------------------------------===//
 
   /// Return metadata containing two branch weights.
-  /// @param TrueWeight the weight of the true branch
-  /// @param FalseWeight the weight of the false branch
-  /// @param Do these weights come from __builtin_expect*
-  MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight,
-                              bool IsExpected = false);
+  MDNode *createBranchWeights(uint32_t TrueWeight, uint32_t FalseWeight);
 
   /// Return metadata containing two branch weights, with significant bias
   /// towards `true` destination.
@@ -74,10 +70,7 @@ class MDBuilder {
   MDNode *createUnlikelyBranchWeights();
 
   /// Return metadata containing a number of branch weights.
-  /// @param Weights the weights of all the branches
-  /// @param Do these weights come from __builtin_expect*
-  MDNode *createBranchWeights(ArrayRef<uint32_t> Weights,
-                              bool IsExpected = false);
+  MDNode *createBranchWeights(ArrayRef<uint32_t> Weights);
 
   /// Return metadata specifying that a branch or switch is unpredictable.
   MDNode *createUnpredictable();
diff --git a/llvm/include/llvm/IR/ProfDataUtils.h b/llvm/include/llvm/IR/ProfDataUtils.h
index 1d7c97d9be953e..88fbad4d6b9d82 100644
--- a/llvm/include/llvm/IR/ProfDataUtils.h
+++ b/llvm/include/llvm/IR/ProfDataUtils.h
@@ -55,17 +55,6 @@ MDNode *getBranchWeightMDNode(const Instruction &I);
 /// Nullptr otherwise.
 MDNode *getValidBranchWeightMDNode(const Instruction &I);
 
-/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect*
-/// intrinsic
-bool hasBranchWeightOrigin(const Instruction &I);
-
-/// Check if Branch Weight Metadata has an "expected" field from an llvm.expect*
-/// intrinsic
-bool hasBranchWeightOrigin(const MDNode *ProfileData);
-
-/// Return the offset to the first branch weight data
-unsigned getBranchWeightOffset(const MDNode *ProfileData);
-
 /// Extract branch weights from MD_prof metadata
 ///
 /// \param ProfileData A pointer to an MDNode.
@@ -122,11 +111,7 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalWeights);
 
 /// Create a new `branch_weights` metadata node and add or overwrite
 /// a `prof` metadata reference to instruction `I`.
-/// \param I the Instruction to set branch weights on.
-/// \param Weights an array of weights to set on instruction I.
-/// \param IsExpected were these weights added from an llvm.expect* intrinsic.
-void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
-                      bool IsExpected);
+void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights);
 
 /// Scaling the profile data attached to 'I' using the ratio of S/T.
 void scaleProfData(Instruction &I, uint64_t S, uint64_t T);
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 3af8e402e5e5da..36933b1f8517be 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -191,7 +191,7 @@ void initializeMachineCopyPropagationPass(PassRegistry&);
 void initializeMachineCycleInfoPrinterPassPass(PassRegistry &);
 void initializeMachineCycleInfoWrapperPassPass(PassRegistry &);
 void initializeMachineDominanceFrontierPass(PassRegistry&);
-void initializeMachineDominatorTreePass(PassRegistry&);
+void initializeMachineDominatorTreeWrapperPassPass(PassRegistry &);
 void initializeMachineFunctionPrinterPassPass(PassRegistry&);
 void initializeMachineFunctionSplitterPass(PassRegistry &);
 void initializeMachineLateInstrsCleanupPass(PassRegistry&);
diff --git a/llvm/include/llvm/MC/MCContext.h b/llvm/include/llvm/MC/MCContext.h
index ed8df35ee173a3..c30a7b27522082 100644
--- a/llvm/include/llvm/MC/MCContext.h
+++ b/llvm/include/llvm/MC/MCContext.h
@@ -255,31 +255,6 @@ class MCContext {
   /// A collection of MCPseudoProbe in the current module
   MCPseudoProbeTable PseudoProbeTable;
 
-  // Sections are differentiated by the quadruple (section_name, group_name,
-  // unique_id, link_to_symbol_name). Sections sharing the same quadruple are
-  // combined into one section.
-  struct ELFSectionKey {
-    std::string SectionName;
-    StringRef GroupName;
-    StringRef LinkedToName;
-    unsigned UniqueID;
-
-    ELFSectionKey(StringRef SectionName, StringRef GroupName,
-                  StringRef LinkedToName, unsigned UniqueID)
-        : SectionName(SectionName), GroupName(GroupName),
-          LinkedToName(LinkedToName), UniqueID(UniqueID) {}
-
-    bool operator<(const ELFSectionKey &Other) const {
-      if (SectionName != Other.SectionName)
-        return SectionName < Other.SectionName;
-      if (GroupName != Other.GroupName)
-        return GroupName < Other.GroupName;
-      if (int O = LinkedToName.compare(Other.LinkedToName))
-        return O < 0;
-      return UniqueID < Other.UniqueID;
-    }
-  };
-
   struct COFFSectionKey {
     std::string SectionName;
     StringRef GroupName;
@@ -353,8 +328,8 @@ class MCContext {
   };
 
   StringMap<MCSectionMachO *> MachOUniquingMap;
-  std::map<ELFSectionKey, MCSectionELF *> ELFUniquingMap;
   std::map<COFFSectionKey, MCSectionCOFF *> COFFUniquingMap;
+  StringMap<MCSectionELF *> ELFUniquingMap;
   std::map<std::string, MCSectionGOFF *> GOFFUniquingMap;
   std::map<WasmSectionKey, MCSectionWasm *> WasmUniquingMap;
   std::map<XCOFFSectionKey, MCSectionXCOFF *> XCOFFUniquingMap;
diff --git a/llvm/include/llvm/MC/MCSection.h b/llvm/include/llvm/MC/MCSection.h
index 90effde5bb670a..217b9b4b5bc52b 100644
--- a/llvm/include/llvm/MC/MCSection.h
+++ b/llvm/include/llvm/MC/MCSection.h
@@ -63,9 +63,6 @@ class MCSection {
   using const_iterator = FragmentListType::const_iterator;
   using iterator = FragmentListType::iterator;
 
-  using const_reverse_iterator = FragmentListType::const_reverse_iterator;
-  using reverse_iterator = FragmentListType::reverse_iterator;
-
 private:
   MCSymbol *Begin;
   MCSymbol *End = nullptr;
@@ -192,6 +189,7 @@ class MCSection {
 
   iterator end() { return Fragments.end(); }
   const_iterator end() const { return Fragments.end(); }
+  bool empty() const { return Fragments.empty(); }
 
   void addFragment(MCFragment &F) { Fragments.push_back(&F); }
 
diff --git a/llvm/include/llvm/ProfileData/MemProf.h b/llvm/include/llvm/ProfileData/MemProf.h
index 53ddfd19234108..a9f7d7c3fda885 100644
--- a/llvm/include/llvm/ProfileData/MemProf.h
+++ b/llvm/include/llvm/ProfileData/MemProf.h
@@ -988,7 +988,7 @@ computeFrameHistogram(llvm::MapVector<CallStackId, llvm::SmallVector<FrameId>>
 //
 // The radix tree allows us to reconstruct call stacks in the leaf-to-root
 // order as we scan the array from left ro right while following pointers to
-// parents along the way
+// parents along the way.
 //
 // For example, if we are decoding CallStackId 2, we start a forward traversal
 // at Index 7, noting the call stack length of 4 and obtaining f5 and f4.  When
diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h
index f0e4ee534ece38..adb5ba6e3dcc2f 100644
--- a/llvm/include/llvm/Support/MathExtras.h
+++ b/llvm/include/llvm/Support/MathExtras.h
@@ -635,6 +635,9 @@ std::enable_if_t<std::is_signed_v<T>, T> SubOverflow(T X, T Y, T &Result) {
 /// result, returning true if an overflow ocurred.
 template <typename T>
 std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
+#if __has_builtin(__builtin_mul_overflow)
+  return __builtin_mul_overflow(X, Y, &Result);
+#else
   // Perform the unsigned multiplication on absolute values.
   using U = std::make_unsigned_t<T>;
   const U UX = X < 0 ? (0 - static_cast<U>(X)) : static_cast<U>(X);
@@ -656,6 +659,7 @@ std::enable_if_t<std::is_signed_v<T>, T> MulOverflow(T X, T Y, T &Result) {
     return UX > (static_cast<U>(std::numeric_limits<T>::max()) + U(1)) / UY;
   else
     return UX > (static_cast<U>(std::numeric_limits<T>::max())) / UY;
+#endif
 }
 
 } // namespace llvm
diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
index dcb00f6b0249fb..b7c1216d16e5ea 100644
--- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h
+++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h
@@ -132,6 +132,48 @@ struct ExtensionInfo {
 #define EMIT_EXTENSIONS
 #include "llvm/TargetParser/AArch64TargetParserDef.inc"
 
+struct ExtensionSet {
+  // Set of extensions which are currently enabled.
+  ExtensionBitset Enabled;
+  // Set of extensions which have been enabled or disabled at any point. Used
+  // to avoid cluttering the cc1 command-line with lots of unneeded features.
+  ExtensionBitset Touched;
+  // Base architecture version, which we need to know because some feature
+  // dependencies change depending on this.
+  const ArchInfo *BaseArch;
+
+  ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {}
+
+  // Enable the given architecture extension, and any other extensions it
+  // depends on. Does not change the base architecture, or follow dependencies
+  // between features which are only related by required arcitecture versions.
+  void enable(ArchExtKind E);
+
+  // Disable the given architecture extension, and any other extensions which
+  // depend on it. Does not change the base architecture, or follow
+  // dependencies between features which are only related by required
+  // arcitecture versions.
+  void disable(ArchExtKind E);
+
+  // Add default extensions for the given CPU. Records the base architecture,
+  // to later resolve dependencies which depend on it.
+  void addCPUDefaults(const CpuInfo &CPU);
+
+  // Add default extensions for the given architecture version. Records the
+  // base architecture, to later resolve dependencies which depend on it.
+  void addArchDefaults(const ArchInfo &Arch);
+
+  // Add or remove a feature based on a modifier string. The string must be of
+  // the form "<name>" to enable a feature or "no<name>" to disable it. This
+  // will also enable or disable any features as required by the dependencies
+  // between them.
+  bool parseModifier(StringRef Modifier);
+
+  // Convert the set of enabled extension to an LLVM feature list, appending
+  // them to Features.
+  void toLLVMFeatureList(std::vector<StringRef> &Features) const;
+};
+
 // Represents a dependency between two architecture extensions. Later is the
 // feature which was added to the architecture after Earlier, and expands the
 // functionality provided by it. If Later is enabled, then Earlier will also be
@@ -542,65 +584,6 @@ inline constexpr CpuInfo CpuInfos[] = {
                                 AArch64::AEK_PROFILE}))},
 };
 
-struct ExtensionSet {
-  // Set of extensions which are currently enabled.
-  ExtensionBitset Enabled;
-  // Set of extensions which have been enabled or disabled at any point. Used
-  // to avoid cluttering the cc1 command-line with lots of unneeded features.
-  ExtensionBitset Touched;
-  // Base architecture version, which we need to know because some feature
-  // dependencies change depending on this.
-  const ArchInfo *BaseArch;
-
-  ExtensionSet() : Enabled(), Touched(), BaseArch(nullptr) {}
-
-  // Enable the given architecture extension, and any other extensions it
-  // depends on. Does not change the base architecture, or follow dependencies
-  // between features which are only related by required arcitecture versions.
-  void enable(ArchExtKind E);
-
-  // Disable the given architecture extension, and any other extensions which
-  // depend on it. Does not change the base architecture, or follow
-  // dependencies between features which are only related by required
-  // arcitecture versions.
-  void disable(ArchExtKind E);
-
-  // Add default extensions for the given CPU. Records the base architecture,
-  // to later resolve dependencies which depend on it.
-  void addCPUDefaults(const CpuInfo &CPU);
-
-  // Add default extensions for the given architecture version. Records the
-  // base architecture, to later resolve dependencies which depend on it.
-  void addArchDefaults(const ArchInfo &Arch);
-
-  // Add or remove a feature based on a modifier string. The string must be of
-  // the form "<name>" to enable a feature or "no<name>" to disable it. This
-  // will also enable or disable any features as required by the dependencies
-  // between them.
-  bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false);
-
-  // Constructs a new ExtensionSet by toggling the corresponding bits for every
-  // feature in the \p Features list without expanding their dependencies. Used
-  // for reconstructing an ExtensionSet from the output of toLLVMFeatures().
-  void reconstructFromParsedFeatures(const std::vector<std::string> &Features);
-
-  // Convert the set of enabled extension to an LLVM feature list, appending
-  // them to Features.
-  template <typename T> void toLLVMFeatureList(std::vector<T> &Features) const {
-    if (BaseArch && !BaseArch->ArchFeature.empty())
-      Features.emplace_back(T(BaseArch->ArchFeature));
-
-    for (const auto &E : Extensions) {
-      if (E.Feature.empty() || !Touched.test(E.ID))
-        continue;
-      if (Enabled.test(E.ID))
-        Features.emplace_back(T(E.Feature));
-      else
-        Features.emplace_back(T(E.NegFeature));
-    }
-  }
-};
-
 // Name alias.
 struct Alias {
   StringRef AltName;
@@ -624,13 +607,7 @@ const ArchInfo *getArchForCpu(StringRef CPU);
 
 // Parser
 const ArchInfo *parseArch(StringRef Arch);
-
-// Return the extension which has the given -target-feature name.
-std::optional<ExtensionInfo> targetFeatureToExtension(StringRef TargetFeature);
-
-// Parse a name as defined by the Extension class in tablegen.
 std::optional<ExtensionInfo> parseArchExtension(StringRef Extension);
-
 // Given the name of a CPU or alias, return the correponding CpuInfo.
 std::optional<CpuInfo> parseCpu(StringRef Name);
 // Used by target parser tests
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def
index 8daa8a689c95f5..f10cede0734e44 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.def
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.def
@@ -250,8 +250,8 @@ X86_FEATURE_COMPAT(SHA512,          "sha512",                 0)
 X86_FEATURE_COMPAT(SM4,             "sm4",                    0)
 X86_FEATURE       (EGPR,            "egpr")
 X86_FEATURE_COMPAT(USERMSR,         "usermsr",                0)
-X86_FEATURE_COMPAT(AVX10_1,         "avx10.1-256",            0)
-X86_FEATURE_COMPAT(AVX10_1_512,     "avx10.1-512",            0)
+X86_FEATURE_COMPAT(AVX10_1,         "avx10.1-256",           36)
+X86_FEATURE_COMPAT(AVX10_1_512,     "avx10.1-512",           37)
 // These features aren't really CPU features, but the frontend can set them.
 X86_FEATURE       (RETPOLINE_EXTERNAL_THUNK,    "retpoline-external-thunk")
 X86_FEATURE       (RETPOLINE_INDIRECT_BRANCHES, "retpoline-indirect-branches")
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index 3ca3ae951fcd78..e0f5bf0ab83658 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -866,8 +866,6 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
                                   ArrayRef<Constant *> Ops,
                                   const DataLayout &DL,
                                   const TargetLibraryInfo *TLI) {
-  bool InBounds = GEP->isInBounds();
-
   Type *SrcElemTy = GEP->getSourceElementType();
   Type *ResTy = GEP->getType();
   if (!SrcElemTy->isSized() || isa<ScalableVectorType>(SrcElemTy))
@@ -898,8 +896,10 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
     InRange = InRange->sextOrTrunc(BitWidth);
 
   // If this is a GEP of a GEP, fold it all into a single GEP.
+  GEPNoWrapFlags NW = GEP->getNoWrapFlags();
+  bool Overflow = false;
   while (auto *GEP = dyn_cast<GEPOperator>(Ptr)) {
-    InBounds &= GEP->isInBounds();
+    NW &= GEP->getNoWrapFlags();
 
     SmallVector<Value *, 4> NestedOps(llvm::drop_begin(GEP->operands()));
 
@@ -923,9 +923,16 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
 
     Ptr = cast<Constant>(GEP->getOperand(0));
     SrcElemTy = GEP->getSourceElementType();
-    Offset += APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps));
+    Offset = Offset.sadd_ov(
+        APInt(BitWidth, DL.getIndexedOffsetInType(SrcElemTy, NestedOps)),
+        Overflow);
   }
 
+  // Preserving nusw (without inbounds) also requires that the offset
+  // additions did not overflow.
+  if (NW.hasNoUnsignedSignedWrap() && !NW.isInBounds() && Overflow)
+    NW = NW.withoutNoUnsignedSignedWrap();
+
   // If the base value for this address is a literal integer value, fold the
   // getelementptr to the resulting integer value casted to the pointer type.
   APInt BasePtr(BitWidth, 0);
@@ -944,17 +951,19 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
   }
 
   // Try to infer inbounds for GEPs of globals.
-  if (!InBounds && Offset.isNonNegative()) {
+  // TODO(gep_nowrap): Also infer nuw flag.
+  if (!NW.isInBounds() && Offset.isNonNegative()) {
     bool CanBeNull, CanBeFreed;
     uint64_t DerefBytes =
         Ptr->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed);
-    InBounds = DerefBytes != 0 && !CanBeNull && Offset.sle(DerefBytes);
+    if (DerefBytes != 0 && !CanBeNull && Offset.sle(DerefBytes))
+      NW |= GEPNoWrapFlags::inBounds();
   }
 
   // Otherwise canonicalize this to a single ptradd.
   LLVMContext &Ctx = Ptr->getContext();
   return ConstantExpr::getGetElementPtr(Type::getInt8Ty(Ctx), Ptr,
-                                        ConstantInt::get(Ctx, Offset), InBounds,
+                                        ConstantInt::get(Ctx, Offset), NW,
                                         InRange);
 }
 
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 10ee8f4be049c7..c473e29f313807 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -74,6 +74,23 @@ static std::string getTypeString(Type *T) {
   return Tmp.str();
 }
 
+// Whatever debug info format we parsed, we should convert to the expected debug
+// info format immediately afterwards.
+bool LLParser::finalizeDebugInfoFormat(Module *M) {
+  // We should have already returned an error if we observed both intrinsics and
+  // records in this IR.
+  assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
+         "Mixed debug intrinsics/records seen without a parsing error?");
+  if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
+    UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
+    WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
+    WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
+  } else if (M) {
+    M->setIsNewDbgInfoFormat(false);
+  }
+  return false;
+}
+
 /// Run: module ::= toplevelentity*
 bool LLParser::Run(bool UpgradeDebugInfo,
                    DataLayoutCallbackTy DataLayoutCallback) {
@@ -91,7 +108,7 @@ bool LLParser::Run(bool UpgradeDebugInfo,
   }
 
   return parseTopLevelEntities() || validateEndOfModule(UpgradeDebugInfo) ||
-         validateEndOfIndex();
+         validateEndOfIndex() || finalizeDebugInfoFormat(M);
 }
 
 bool LLParser::parseStandaloneConstantValue(Constant *&C,
@@ -190,18 +207,6 @@ void LLParser::dropUnknownMetadataReferences() {
 bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   if (!M)
     return false;
-
-  // We should have already returned an error if we observed both intrinsics and
-  // records in this IR.
-  assert(!(SeenNewDbgInfoFormat && SeenOldDbgInfoFormat) &&
-         "Mixed debug intrinsics/records seen without a parsing error?");
-  if (PreserveInputDbgFormat == cl::boolOrDefault::BOU_TRUE) {
-    UseNewDbgInfoFormat = SeenNewDbgInfoFormat;
-    WriteNewDbgInfoFormatToBitcode = SeenNewDbgInfoFormat;
-    WriteNewDbgInfoFormat = SeenNewDbgInfoFormat;
-    M->setNewDbgInfoFormatFlag(SeenNewDbgInfoFormat);
-  }
-
   // Handle any function attribute group forward references.
   for (const auto &RAG : ForwardRefAttrGroups) {
     Value *V = RAG.first;
@@ -434,9 +439,6 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) {
   UpgradeModuleFlags(*M);
   UpgradeSectionAttributes(*M);
 
-  if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE)
-    M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat);
-
   if (!Slots)
     return false;
   // Initialize the slot mapping.
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index ca75a5184f9712..38c9dedbf4ebaa 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -2130,6 +2130,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
     return Attribute::SanitizeThread;
   case bitc::ATTR_KIND_SANITIZE_MEMORY:
     return Attribute::SanitizeMemory;
+  case bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY:
+    return Attribute::SanitizeNumericalStability;
   case bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING:
     return Attribute::SpeculativeLoadHardening;
   case bitc::ATTR_KIND_SWIFT_ERROR:
@@ -4357,7 +4359,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
   if (PreserveInputDbgFormat != cl::boolOrDefault::BOU_TRUE) {
     TheModule->IsNewDbgInfoFormat =
         UseNewDbgInfoFormat &&
-        LoadBitcodeIntoNewDbgInfoFormat != cl::boolOrDefault::BOU_FALSE;
+        LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_TRUE;
   }
 
   this->ValueTypeCallback = std::move(Callbacks.ValueType);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index b611124bf269d1..d1e2c05e843eb1 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -840,6 +840,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
     return bitc::ATTR_KIND_SANITIZE_MEMORY;
   case Attribute::SanitizedPaddedGlobal:
     return bitc::ATTR_KIND_SANITIZED_PADDED_GLOBAL;
+  case Attribute::SanitizeNumericalStability:
+    return bitc::ATTR_KIND_SANITIZE_NUMERICAL_STABILITY;
   case Attribute::SpeculativeLoadHardening:
     return bitc::ATTR_KIND_SPECULATIVE_LOAD_HARDENING;
   case Attribute::SwiftError:
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index fa241da1b0f064..a5cbc897321076 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -1779,7 +1779,8 @@ void AsmPrinter::emitFunctionBody() {
 
   if (isVerbose()) {
     // Get MachineDominatorTree or compute it on the fly if it's unavailable
-    MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+    auto MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+    MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
     if (!MDT) {
       OwnedMDT = std::make_unique<MachineDominatorTree>();
       OwnedMDT->getBase().recalculate(*MF);
@@ -3228,6 +3229,9 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
   if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
     return MCConstantExpr::create(CI->getZExtValue(), Ctx);
 
+  if (const ConstantPtrAuth *CPA = dyn_cast<ConstantPtrAuth>(CV))
+    return lowerConstantPtrAuth(*CPA);
+
   if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
     return MCSymbolRefExpr::create(getSymbol(GV), Ctx);
 
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 544f1b7f593531..b9093208aad588 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -80,7 +80,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeMachineCopyPropagationPass(Registry);
   initializeMachineCycleInfoPrinterPassPass(Registry);
   initializeMachineCycleInfoWrapperPassPass(Registry);
-  initializeMachineDominatorTreePass(Registry);
+  initializeMachineDominatorTreeWrapperPassPass(Registry);
   initializeMachineFunctionPrinterPassPass(Registry);
   initializeMachineLateInstrsCleanupPass(Registry);
   initializeMachineLICMPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 0e01080bd75cc9..339a1f1f2f002e 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -8866,8 +8866,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, ModifyDT &ModifiedDT) {
         scaleWeights(NewTrueWeight, NewFalseWeight);
         Br1->setMetadata(LLVMContext::MD_prof,
                          MDBuilder(Br1->getContext())
-                             .createBranchWeights(TrueWeight, FalseWeight,
-                                                  hasBranchWeightOrigin(*Br1)));
+                             .createBranchWeights(TrueWeight, FalseWeight));
 
         NewTrueWeight = TrueWeight;
         NewFalseWeight = 2 * FalseWeight;
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index 2a7bee1618deb2..30480e598acef8 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -790,15 +790,15 @@ char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
 INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE,
                       "Early If Converter", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
 INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE,
                     "Early If Converter", false, false)
 
 void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfo>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<MachineTraceMetrics>();
@@ -1089,7 +1089,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI.getRegisterInfo();
   SchedModel = STI.getSchedModel();
   MRI = &MF.getRegInfo();
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
   MinInstr = nullptr;
@@ -1144,15 +1144,15 @@ char &llvm::EarlyIfPredicatorID = EarlyIfPredicator::ID;
 
 INITIALIZE_PASS_BEGIN(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_END(EarlyIfPredicator, DEBUG_TYPE, "Early If Predicator", false,
                     false)
 
 void EarlyIfPredicator::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfo>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -1223,7 +1223,7 @@ bool EarlyIfPredicator::runOnMachineFunction(MachineFunction &MF) {
   TRI = STI.getRegisterInfo();
   MRI = &MF.getRegInfo();
   SchedModel.init(&STI);
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
 
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index 69c671220db353..98a0150a84e315 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -135,8 +135,8 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
                    VirtRegMap &vrm)
       : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
         LSS(pass.getAnalysis<LiveStacks>()),
-        MDT(pass.getAnalysis<MachineDominatorTree>()), VRM(vrm),
-        MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
+        MDT(pass.getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree()),
+        VRM(vrm), MRI(mf.getRegInfo()), TII(*mf.getSubtarget().getInstrInfo()),
         TRI(*mf.getSubtarget().getRegisterInfo()),
         MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
         IPA(LIS, mf.getNumBlockIDs()) {}
@@ -192,8 +192,8 @@ class InlineSpiller : public Spiller {
                 VirtRegAuxInfo &VRAI)
       : MF(MF), LIS(Pass.getAnalysis<LiveIntervals>()),
         LSS(Pass.getAnalysis<LiveStacks>()),
-        MDT(Pass.getAnalysis<MachineDominatorTree>()), VRM(VRM),
-        MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
+        MDT(Pass.getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree()),
+        VRM(VRM), MRI(MF.getRegInfo()), TII(*MF.getSubtarget().getInstrInfo()),
         TRI(*MF.getSubtarget().getRegisterInfo()),
         MBFI(Pass.getAnalysis<MachineBlockFrequencyInfo>()),
         HSpiller(Pass, MF, VRM), VRAI(VRAI) {}
@@ -1381,7 +1381,7 @@ void HoistSpillHelper::rmRedundantSpills(
   // earlier spill with smaller SlotIndex.
   for (auto *const CurrentSpill : Spills) {
     MachineBasicBlock *Block = CurrentSpill->getParent();
-    MachineDomTreeNode *Node = MDT.getBase().getNode(Block);
+    MachineDomTreeNode *Node = MDT.getNode(Block);
     MachineInstr *PrevSpill = SpillBBToSpill[Node];
     if (PrevSpill) {
       SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
@@ -1389,9 +1389,9 @@ void HoistSpillHelper::rmRedundantSpills(
       MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
       MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
       SpillsToRm.push_back(SpillToRm);
-      SpillBBToSpill[MDT.getBase().getNode(Block)] = SpillToKeep;
+      SpillBBToSpill[MDT.getNode(Block)] = SpillToKeep;
     } else {
-      SpillBBToSpill[MDT.getBase().getNode(Block)] = CurrentSpill;
+      SpillBBToSpill[MDT.getNode(Block)] = CurrentSpill;
     }
   }
   for (auto *const SpillToRm : SpillsToRm)
@@ -1465,7 +1465,7 @@ void HoistSpillHelper::getVisitOrders(
   // Sort the nodes in WorkSet in top-down order and save the nodes
   // in Orders. Orders will be used for hoisting in runHoistSpills.
   unsigned idx = 0;
-  Orders.push_back(MDT.getBase().getNode(Root));
+  Orders.push_back(MDT.getNode(Root));
   do {
     MachineDomTreeNode *Node = Orders[idx++];
     for (MachineDomTreeNode *Child : Node->children()) {
diff --git a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
index 39b44b917d9e37..721b75900c8ef0 100644
--- a/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -64,7 +64,8 @@ LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
 
   auto &MBPI = getAnalysis<MachineBranchProbabilityInfo>();
   auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
-  auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+  auto *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
   LLVM_DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
   LLVM_DEBUG(if (MLI) dbgs() << "LoopInfo is available\n");
 
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 3a59ae7ab06644..16d8e916ce6682 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -78,13 +78,13 @@ char LiveDebugVariables::ID = 0;
 
 INITIALIZE_PASS_BEGIN(LiveDebugVariables, DEBUG_TYPE,
                 "Debug Variable Analysis", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(LiveDebugVariables, DEBUG_TYPE,
                 "Debug Variable Analysis", false, false)
 
 void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequiredTransitive<LiveIntervals>();
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 42c769399a1401..f9162b444e03d8 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -61,7 +61,7 @@ char LiveIntervals::ID = 0;
 char &llvm::LiveIntervalsID = LiveIntervals::ID;
 INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis",
                       false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
                 "Live Interval Analysis", false, false)
@@ -123,7 +123,7 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
   TRI = MF->getSubtarget().getRegisterInfo();
   TII = MF->getSubtarget().getInstrInfo();
   Indexes = &getAnalysis<SlotIndexes>();
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   if (!LICalc)
     LICalc = new LiveIntervalCalc();
diff --git a/llvm/lib/CodeGen/MIRSampleProfile.cpp b/llvm/lib/CodeGen/MIRSampleProfile.cpp
index 6faa1ad1a7790e..138cc567487626 100644
--- a/llvm/lib/CodeGen/MIRSampleProfile.cpp
+++ b/llvm/lib/CodeGen/MIRSampleProfile.cpp
@@ -70,7 +70,7 @@ INITIALIZE_PASS_BEGIN(MIRProfileLoaderPass, DEBUG_TYPE,
                       "Load MIR Sample Profile",
                       /* cfg = */ false, /* is_analysis = */ false)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
@@ -365,7 +365,7 @@ bool MIRProfileLoaderPass::runOnMachineFunction(MachineFunction &MF) {
                     << MF.getFunction().getName() << "\n");
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MIRSampleLoader->setInitVals(
-      &getAnalysis<MachineDominatorTree>(),
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(),
       &getAnalysis<MachinePostDominatorTree>(), &getAnalysis<MachineLoopInfo>(),
       MBFI, &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE());
 
@@ -400,7 +400,7 @@ bool MIRProfileLoaderPass::doInitialization(Module &M) {
 void MIRProfileLoaderPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired<MachineBlockFrequencyInfo>();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachinePostDominatorTree>();
   AU.addRequiredTransitive<MachineLoopInfo>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index 0bd5f09564ec0c..16505f21f0aadc 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -1330,9 +1330,9 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
     LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs);
   }
 
-  if (MachineDominatorTree *MDT =
-          P.getAnalysisIfAvailable<MachineDominatorTree>())
-    MDT->recordSplitCriticalEdge(this, Succ, NMBB);
+  if (auto *MDTWrapper =
+          P.getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>())
+    MDTWrapper->getDomTree().recordSplitCriticalEdge(this, Succ, NMBB);
 
   if (MachineLoopInfo *MLI = P.getAnalysisIfAvailable<MachineLoopInfo>())
     if (MachineLoop *TIL = MLI->getLoopFor(this)) {
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 42cdcaa5bbf4f2..4e6101f8755897 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -92,8 +92,8 @@ namespace {
       MachineFunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AAResultsWrapperPass>();
       AU.addPreservedID(MachineLoopInfoID);
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addPreserved<MachineBlockFrequencyInfo>();
     }
@@ -166,7 +166,7 @@ char &llvm::MachineCSEID = MachineCSE::ID;
 
 INITIALIZE_PASS_BEGIN(MachineCSE, DEBUG_TYPE,
                       "Machine Common Subexpression Elimination", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(MachineCSE, DEBUG_TYPE,
                     "Machine Common Subexpression Elimination", false, false)
@@ -943,7 +943,7 @@ bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
-  DT = &getAnalysis<MachineDominatorTree>();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   LookAheadLimit = TII->getMachineCSELookAheadLimit();
   bool ChangedPRE, ChangedCSE;
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index c11263163a34ff..3bd3b8a386b419 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -139,7 +139,7 @@ INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner",
 
 void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<MachineTraceMetrics>();
diff --git a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
index 346cfedde390d3..6a8ede4feb9378 100644
--- a/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
+++ b/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -26,7 +26,7 @@ char MachineDominanceFrontier::ID = 0;
 
 INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier",
                 "Machine Dominance Frontier Construction", true, true)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier",
                 "Machine Dominance Frontier Construction", true, true)
 
@@ -38,7 +38,8 @@ char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID;
 
 bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) {
   releaseMemory();
-  Base.analyze(getAnalysis<MachineDominatorTree>().getBase());
+  Base.analyze(
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree().getBase());
   return false;
 }
 
@@ -48,6 +49,6 @@ void MachineDominanceFrontier::releaseMemory() {
 
 void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
diff --git a/llvm/lib/CodeGen/MachineDominators.cpp b/llvm/lib/CodeGen/MachineDominators.cpp
index 0632cde9c6f4e4..bdde8a4b435700 100644
--- a/llvm/lib/CodeGen/MachineDominators.cpp
+++ b/llvm/lib/CodeGen/MachineDominators.cpp
@@ -37,51 +37,59 @@ static cl::opt<bool, true> VerifyMachineDomInfoX(
 namespace llvm {
 template class DomTreeNodeBase<MachineBasicBlock>;
 template class DominatorTreeBase<MachineBasicBlock, false>; // DomTreeBase
-}
 
-char MachineDominatorTree::ID = 0;
+namespace DomTreeBuilder {
+template void Calculate<MBBDomTree>(MBBDomTree &DT);
+template void CalculateWithUpdates<MBBDomTree>(MBBDomTree &DT, MBBUpdates U);
 
-INITIALIZE_PASS(MachineDominatorTree, "machinedomtree",
-                "MachineDominator Tree Construction", true, true)
+template void InsertEdge<MBBDomTree>(MBBDomTree &DT, MachineBasicBlock *From,
+                                     MachineBasicBlock *To);
 
-char &llvm::MachineDominatorsID = MachineDominatorTree::ID;
+template void DeleteEdge<MBBDomTree>(MBBDomTree &DT, MachineBasicBlock *From,
+                                     MachineBasicBlock *To);
 
-void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.setPreservesAll();
-  MachineFunctionPass::getAnalysisUsage(AU);
+template void ApplyUpdates<MBBDomTree>(MBBDomTree &DT, MBBDomTreeGraphDiff &,
+                                       MBBDomTreeGraphDiff *);
+
+template bool Verify<MBBDomTree>(const MBBDomTree &DT,
+                                 MBBDomTree::VerificationLevel VL);
+} // namespace DomTreeBuilder
 }
 
-bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
-  calculate(F);
-  return false;
+char MachineDominatorTreeWrapperPass::ID = 0;
+
+INITIALIZE_PASS(MachineDominatorTreeWrapperPass, "machinedomtree",
+                "MachineDominator Tree Construction", true, true)
+
+MachineDominatorTreeWrapperPass::MachineDominatorTreeWrapperPass()
+    : MachineFunctionPass(ID) {
+  initializeMachineDominatorTreeWrapperPassPass(
+      *PassRegistry::getPassRegistry());
 }
 
 void MachineDominatorTree::calculate(MachineFunction &F) {
   CriticalEdgesToSplit.clear();
   NewBBs.clear();
-  DT.reset(new DomTreeBase<MachineBasicBlock>());
-  DT->recalculate(F);
+  recalculate(F);
 }
 
-MachineDominatorTree::MachineDominatorTree()
-    : MachineFunctionPass(ID) {
-  initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
-}
+char &llvm::MachineDominatorsID = MachineDominatorTreeWrapperPass::ID;
 
-void MachineDominatorTree::releaseMemory() {
-  CriticalEdgesToSplit.clear();
-  DT.reset(nullptr);
+bool MachineDominatorTreeWrapperPass::runOnMachineFunction(MachineFunction &F) {
+  DT = MachineDominatorTree(F);
+  return false;
 }
 
-void MachineDominatorTree::verifyAnalysis() const {
-  if (DT && VerifyMachineDomInfo)
-    if (!DT->verify(MachineDomTree::VerificationLevel::Basic)) {
-      errs() << "MachineDominatorTree verification failed\n";
-      abort();
-    }
+void MachineDominatorTreeWrapperPass::releaseMemory() { DT.reset(); }
+
+void MachineDominatorTreeWrapperPass::verifyAnalysis() const {
+  if (VerifyMachineDomInfo && DT)
+    if (!DT->verify(MachineDominatorTree::VerificationLevel::Basic))
+      report_fatal_error("MachineDominatorTree verification failed!");
 }
 
-void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
+void MachineDominatorTreeWrapperPass::print(raw_ostream &OS,
+                                            const Module *) const {
   if (DT)
     DT->print(OS);
 }
@@ -103,7 +111,7 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
   for (CriticalEdge &Edge : CriticalEdgesToSplit) {
     // Update dominator information.
     MachineBasicBlock *Succ = Edge.ToBB;
-    MachineDomTreeNode *SuccDTNode = DT->getNode(Succ);
+    MachineDomTreeNode *SuccDTNode = Base::getNode(Succ);
 
     for (MachineBasicBlock *PredBB : Succ->predecessors()) {
       if (PredBB == Edge.NewBB)
@@ -126,7 +134,7 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
                                            "than one predecessor!");
         PredBB = *PredBB->pred_begin();
       }
-      if (!DT->dominates(SuccDTNode, DT->getNode(PredBB))) {
+      if (!Base::dominates(SuccDTNode, Base::getNode(PredBB))) {
         IsNewIDom[Idx] = false;
         break;
       }
@@ -138,13 +146,16 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
   Idx = 0;
   for (CriticalEdge &Edge : CriticalEdgesToSplit) {
     // We know FromBB dominates NewBB.
-    MachineDomTreeNode *NewDTNode = DT->addNewBlock(Edge.NewBB, Edge.FromBB);
+    MachineDomTreeNode *NewDTNode =
+        const_cast<MachineDominatorTree *>(this)->Base::addNewBlock(
+            Edge.NewBB, Edge.FromBB);
 
     // If all the other predecessors of "Succ" are dominated by "Succ" itself
     // then the new block is the new immediate dominator of "Succ". Otherwise,
     // the new block doesn't dominate anything.
     if (IsNewIDom[Idx])
-      DT->changeImmediateDominator(DT->getNode(Edge.ToBB), NewDTNode);
+      const_cast<MachineDominatorTree *>(this)->Base::changeImmediateDominator(
+          Base::getNode(Edge.ToBB), NewDTNode);
     ++Idx;
   }
   NewBBs.clear();
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 86eb259c090152..edf8988512c78d 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -191,7 +191,7 @@ namespace {
       AU.addRequired<MachineLoopInfo>();
       if (DisableHoistingToHotterBlocks != UseBFI::None)
         AU.addRequired<MachineBlockFrequencyInfo>();
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addPreserved<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
@@ -223,8 +223,8 @@ namespace {
     void HoistPostRA(MachineInstr *MI, unsigned Def, MachineLoop *CurLoop,
                      MachineBasicBlock *CurPreheader);
 
-    void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                   BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+    void ProcessMI(MachineInstr *MI, BitVector &RUDefs, BitVector &RUClobbers,
+                   SmallSet<int, 32> &StoredFIs,
                    SmallVectorImpl<CandidateInfo> &Candidates,
                    MachineLoop *CurLoop);
 
@@ -325,7 +325,7 @@ INITIALIZE_PASS_BEGIN(MachineLICM, DEBUG_TYPE,
                       "Machine Loop Invariant Code Motion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(MachineLICM, DEBUG_TYPE,
                     "Machine Loop Invariant Code Motion", false, false)
@@ -334,7 +334,7 @@ INITIALIZE_PASS_BEGIN(EarlyMachineLICM, "early-machinelicm",
                       "Early Machine Loop Invariant Code Motion", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(EarlyMachineLICM, "early-machinelicm",
                     "Early Machine Loop Invariant Code Motion", false, false)
@@ -375,7 +375,7 @@ bool MachineLICMBase::runOnMachineFunction(MachineFunction &MF) {
   if (DisableHoistingToHotterBlocks != UseBFI::None)
     MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   MLI = &getAnalysis<MachineLoopInfo>();
-  DT  = &getAnalysis<MachineDominatorTree>();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   if (HoistConstLoads)
@@ -423,10 +423,47 @@ static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
   return false;
 }
 
+static void applyBitsNotInRegMaskToRegUnitsMask(const TargetRegisterInfo &TRI,
+                                                BitVector &RUs,
+                                                const uint32_t *Mask) {
+  // Iterate over the RegMask raw to avoid constructing a BitVector, which is
+  // expensive as it implies dynamically allocating memory.
+  //
+  // We also work backwards.
+  const unsigned NumRegs = TRI.getNumRegs();
+  const unsigned MaskWords = (NumRegs + 31) / 32;
+  for (unsigned K = 0; K < MaskWords; ++K) {
+    // We want to set the bits that aren't in RegMask, so flip it.
+    uint32_t Word = ~Mask[K];
+
+    // Iterate all set bits, starting from the right.
+    while (Word) {
+      const unsigned SetBitIdx = countr_zero(Word);
+
+      // The bits are numbered from the LSB in each word.
+      const unsigned PhysReg = (K * 32) + SetBitIdx;
+
+      // Clear the bit at SetBitIdx. Doing it this way appears to generate less
+      // instructions on x86. This works because negating a number will flip all
+      // the bits after SetBitIdx. So (Word & -Word) == (1 << SetBitIdx), but
+      // faster.
+      Word ^= Word & -Word;
+
+      if (PhysReg == NumRegs)
+        return;
+
+      if (PhysReg) {
+        for (MCRegUnitIterator RUI(PhysReg, &TRI); RUI.isValid(); ++RUI)
+          RUs.set(*RUI);
+      }
+    }
+  }
+}
+
 /// Examine the instruction for potentai LICM candidate. Also
 /// gather register def and frame object update information.
-void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
-                                BitVector &PhysRegClobbers,
+void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &RUDefs,
+                                BitVector &RUClobbers,
                                 SmallSet<int, 32> &StoredFIs,
                                 SmallVectorImpl<CandidateInfo> &Candidates,
                                 MachineLoop *CurLoop) {
@@ -448,7 +485,7 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // We can't hoist an instruction defining a physreg that is clobbered in
     // the loop.
     if (MO.isRegMask()) {
-      PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, MO.getRegMask());
       continue;
     }
 
@@ -460,16 +497,22 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     assert(Reg.isPhysical() && "Not expecting virtual register!");
 
     if (!MO.isDef()) {
-      if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
-        // If it's using a non-loop-invariant register, then it's obviously not
-        // safe to hoist.
-        HasNonInvariantUse = true;
+      if (!HasNonInvariantUse) {
+        for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+          // If it's using a non-loop-invariant register, then it's obviously
+          // not safe to hoist.
+          if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
+            HasNonInvariantUse = true;
+            break;
+          }
+        }
+      }
       continue;
     }
 
     if (MO.isImplicit()) {
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        PhysRegClobbers.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        RUClobbers.set(*RUI);
       if (!MO.isDead())
         // Non-dead implicit def? This cannot be hoisted.
         RuledOut = true;
@@ -488,19 +531,18 @@ void MachineLICMBase::ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
     // If we have already seen another instruction that defines the same
     // register, then this is not safe.  Two defs is indicated by setting a
     // PhysRegClobbers bit.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
-      if (PhysRegDefs.test(*AS))
-        PhysRegClobbers.set(*AS);
+    for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+      if (RUDefs.test(*RUI)) {
+        RUClobbers.set(*RUI);
+        RuledOut = true;
+      } else if (RUClobbers.test(*RUI)) {
+        // MI defined register is seen defined by another instruction in
+        // the loop, it cannot be a LICM candidate.
+        RuledOut = true;
+      }
+
+      RUDefs.set(*RUI);
     }
-    // Need a second loop because MCRegAliasIterator can visit the same
-    // register twice.
-    for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS)
-      PhysRegDefs.set(*AS);
-
-    if (PhysRegClobbers.test(Reg))
-      // MI defined register is seen defined by another instruction in
-      // the loop, it cannot be a LICM candidate.
-      RuledOut = true;
   }
 
   // Only consider reloads for now and remats which do not have register
@@ -521,9 +563,9 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
   if (!Preheader)
     return;
 
-  unsigned NumRegs = TRI->getNumRegs();
-  BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
-  BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+  unsigned NumRegUnits = TRI->getNumRegUnits();
+  BitVector RUDefs(NumRegUnits);     // RUs defined once in the loop.
+  BitVector RUClobbers(NumRegUnits); // RUs defined more than once.
 
   SmallVector<CandidateInfo, 32> Candidates;
   SmallSet<int, 32> StoredFIs;
@@ -540,22 +582,21 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
     // FIXME: That means a reload that're reused in successor block(s) will not
     // be LICM'ed.
     for (const auto &LI : BB->liveins()) {
-      for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
-        PhysRegDefs.set(*AI);
+      for (MCRegUnitIterator RUI(LI.PhysReg, TRI); RUI.isValid(); ++RUI)
+        RUDefs.set(*RUI);
     }
 
     // Funclet entry blocks will clobber all registers
     if (const uint32_t *Mask = BB->getBeginClobberMask(TRI))
-      PhysRegClobbers.setBitsNotInMask(Mask);
+      applyBitsNotInRegMaskToRegUnitsMask(*TRI, RUClobbers, Mask);
 
     SpeculationState = SpeculateUnknown;
     for (MachineInstr &MI : *BB)
-      ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates,
-                CurLoop);
+      ProcessMI(&MI, RUDefs, RUClobbers, StoredFIs, Candidates, CurLoop);
   }
 
   // Gather the registers read / clobbered by the terminator.
-  BitVector TermRegs(NumRegs);
+  BitVector TermRUs(NumRegUnits);
   MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
   if (TI != Preheader->end()) {
     for (const MachineOperand &MO : TI->operands()) {
@@ -564,8 +605,8 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       Register Reg = MO.getReg();
       if (!Reg)
         continue;
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        TermRegs.set(*AI);
+      for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+        TermRUs.set(*RUI);
     }
   }
 
@@ -583,24 +624,36 @@ void MachineLICMBase::HoistRegionPostRA(MachineLoop *CurLoop,
       continue;
 
     unsigned Def = Candidate.Def;
-    if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
-      bool Safe = true;
-      MachineInstr *MI = Candidate.MI;
-      for (const MachineOperand &MO : MI->all_uses()) {
-        if (!MO.getReg())
-          continue;
-        Register Reg = MO.getReg();
-        if (PhysRegDefs.test(Reg) ||
-            PhysRegClobbers.test(Reg)) {
+    bool Safe = true;
+    for (MCRegUnitIterator RUI(Def, TRI); RUI.isValid(); ++RUI) {
+      if (RUClobbers.test(*RUI) || TermRUs.test(*RUI)) {
+        Safe = false;
+        break;
+      }
+    }
+
+    if (!Safe)
+      continue;
+
+    MachineInstr *MI = Candidate.MI;
+    for (const MachineOperand &MO : MI->all_uses()) {
+      if (!MO.getReg())
+        continue;
+      for (MCRegUnitIterator RUI(MO.getReg(), TRI); RUI.isValid(); ++RUI) {
+        if (RUDefs.test(*RUI) || RUClobbers.test(*RUI)) {
           // If it's using a non-loop-invariant register, then it's obviously
           // not safe to hoist.
           Safe = false;
           break;
         }
       }
-      if (Safe)
-        HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
+
+      if (!Safe)
+        break;
     }
+
+    if (Safe)
+      HoistPostRA(MI, Candidate.Def, CurLoop, CurPreheader);
   }
 }
 
diff --git a/llvm/lib/CodeGen/MachineLoopInfo.cpp b/llvm/lib/CodeGen/MachineLoopInfo.cpp
index 1019c53e57c6fb..9fb103945838ad 100644
--- a/llvm/lib/CodeGen/MachineLoopInfo.cpp
+++ b/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -36,14 +36,14 @@ MachineLoopInfo::MachineLoopInfo() : MachineFunctionPass(ID) {
 }
 INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops",
                 "Machine Natural Loop Construction", true, true)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(MachineLoopInfo, "machine-loops",
                 "Machine Natural Loop Construction", true, true)
 
 char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
 
 bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
-  calculate(getAnalysis<MachineDominatorTree>());
+  calculate(getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree());
   return false;
 }
 
@@ -54,7 +54,7 @@ void MachineLoopInfo::calculate(MachineDominatorTree &MDT) {
 
 void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 4f7d9d070cee6f..32f65f0d491398 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -219,7 +219,7 @@ INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
                       "Modulo Software Pipelining", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                     "Modulo Software Pipelining", false, false)
@@ -248,7 +248,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
 
   MF = &mf;
   MLI = &getAnalysis<MachineLoopInfo>();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
   TII = MF->getSubtarget().getInstrInfo();
   RegClassInfo.runOnMachineFunction(*MF);
@@ -481,7 +481,7 @@ void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<AAResultsWrapperPass>();
   AU.addPreserved<AAResultsWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<LiveIntervals>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/CodeGen/MachineRegionInfo.cpp b/llvm/lib/CodeGen/MachineRegionInfo.cpp
index 45cdcbfeab9f12..d496b0c182c76e 100644
--- a/llvm/lib/CodeGen/MachineRegionInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -84,7 +84,7 @@ MachineRegionInfoPass::~MachineRegionInfoPass() = default;
 bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
   releaseMemory();
 
-  auto DT = &getAnalysis<MachineDominatorTree>();
+  auto DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   auto PDT = &getAnalysis<MachinePostDominatorTree>();
   auto DF = &getAnalysis<MachineDominanceFrontier>();
 
@@ -109,7 +109,7 @@ void MachineRegionInfoPass::verifyAnalysis() const {
 
 void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachinePostDominatorTree>();
   AU.addRequired<MachineDominanceFrontier>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -130,7 +130,7 @@ char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
 
 INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
                       "Detect single entry single exit regions", true, true)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
 INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index 03e892a5e0d225..cf72f743808350 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -266,7 +266,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID;
 INITIALIZE_PASS_BEGIN(MachineScheduler, DEBUG_TYPE,
                       "Machine Instruction Scheduler", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
@@ -279,7 +279,7 @@ MachineScheduler::MachineScheduler() : MachineSchedulerBase(ID) {
 
 void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addRequired<AAResultsWrapperPass>();
   AU.addRequired<TargetPassConfig>();
@@ -296,7 +296,7 @@ char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
 
 INITIALIZE_PASS_BEGIN(PostMachineScheduler, "postmisched",
                       "PostRA Machine Instruction Scheduler", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(PostMachineScheduler, "postmisched",
@@ -308,7 +308,7 @@ PostMachineScheduler::PostMachineScheduler() : MachineSchedulerBase(ID) {
 
 void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addRequired<AAResultsWrapperPass>();
   AU.addRequired<TargetPassConfig>();
@@ -445,7 +445,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
   // Initialize the context of the pass.
   MF = &mf;
   MLI = &getAnalysis<MachineLoopInfo>();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   PassConfig = &getAnalysis<TargetPassConfig>();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index 3d40130b92c443..dcfa389e9bf416 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -184,7 +184,7 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
       AU.addRequired<AAResultsWrapperPass>();
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachinePostDominatorTree>();
       AU.addRequired<MachineCycleInfoWrapperPass>();
       AU.addRequired<MachineBranchProbabilityInfo>();
@@ -274,7 +274,7 @@ char &llvm::MachineSinkingID = MachineSinking::ID;
 INITIALIZE_PASS_BEGIN(MachineSinking, DEBUG_TYPE,
                       "Machine code sinking", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(MachineSinking, DEBUG_TYPE,
@@ -708,7 +708,7 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  DT = &getAnalysis<MachineDominatorTree>();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   PDT = &getAnalysis<MachinePostDominatorTree>();
   CI = &getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
   MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr;
diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
index 131138e0649e4c..7548fc8141ec56 100644
--- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
+++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp
@@ -155,7 +155,7 @@ template struct llvm::GenericUniformityAnalysisImplDeleter<
 
 MachineUniformityInfo llvm::computeMachineUniformityInfo(
     MachineFunction &F, const MachineCycleInfo &cycleInfo,
-    const MachineDomTree &domTree, bool HasBranchDivergence) {
+    const MachineDominatorTree &domTree, bool HasBranchDivergence) {
   assert(F.getRegInfo().isSSA() && "Expected to be run on SSA form!");
   MachineUniformityInfo UI(domTree, cycleInfo);
   if (HasBranchDivergence)
@@ -187,19 +187,20 @@ MachineUniformityAnalysisPass::MachineUniformityAnalysisPass()
 INITIALIZE_PASS_BEGIN(MachineUniformityAnalysisPass, "machine-uniformity",
                       "Machine Uniformity Info Analysis", true, true)
 INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(MachineUniformityAnalysisPass, "machine-uniformity",
                     "Machine Uniformity Info Analysis", true, true)
 
 void MachineUniformityAnalysisPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   AU.addRequired<MachineCycleInfoWrapperPass>();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
 bool MachineUniformityAnalysisPass::runOnMachineFunction(MachineFunction &MF) {
-  auto &DomTree = getAnalysis<MachineDominatorTree>().getBase();
+  auto &DomTree =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree().getBase();
   auto &CI = getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
   // FIXME: Query TTI::hasBranchDivergence. -run-pass seems to end up with a
   // default NoTTI
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 0744089486313d..9ea238c61ed91e 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -226,7 +226,7 @@ namespace {
     // This is calculated only when trying to verify convergence control tokens.
     // Similar to the LLVM IR verifier, we calculate this locally instead of
     // relying on the pass manager.
-    MachineDomTree DT;
+    MachineDominatorTree DT;
 
     void visitMachineFunctionBefore();
     void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
@@ -3177,7 +3177,7 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) {
 }
 
 static void
-verifyConvergenceControl(const MachineFunction &MF, MachineDomTree &DT,
+verifyConvergenceControl(const MachineFunction &MF, MachineDominatorTree &DT,
                          std::function<void(const Twine &Message)> FailureCB) {
   MachineConvergenceVerifier CV;
   CV.initialize(&errs(), FailureCB, MF);
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 3254ec0b77fe78..592972f5c83b2b 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -139,7 +139,7 @@ void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<LiveVariables>();
   AU.addPreserved<SlotIndexes>();
   AU.addPreserved<LiveIntervals>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addPreserved<MachineLoopInfo>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -216,8 +216,8 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
 
   // TODO: we should use the incremental DomTree updater here.
   if (Changed)
-    if (auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>())
-      MDT->getBase().recalculate(MF);
+    if (auto *MDT = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>())
+      MDT->getDomTree().getBase().recalculate(MF);
 
   LoweredPHIs.clear();
   ImpDefs.clear();
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index 477a86dbe3f8c4..e6fe7a070f2a50 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -172,8 +172,8 @@ namespace {
       AU.addRequired<MachineLoopInfo>();
       AU.addPreserved<MachineLoopInfo>();
       if (Aggressive) {
-        AU.addRequired<MachineDominatorTree>();
-        AU.addPreserved<MachineDominatorTree>();
+        AU.addRequired<MachineDominatorTreeWrapperPass>();
+        AU.addPreserved<MachineDominatorTreeWrapperPass>();
       }
     }
 
@@ -487,7 +487,7 @@ char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
 
 INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
                       "Peephole Optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
                     "Peephole Optimizations", false, false)
@@ -1670,7 +1670,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
-  DT  = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+  DT = Aggressive ? &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree()
+                  : nullptr;
   MLI = &getAnalysis<MachineLoopInfo>();
   MF.setDelegate(this);
 
diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp
index ffd70a29f17150..8005050d5215ac 100644
--- a/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -85,8 +85,8 @@ namespace {
       AU.setPreservesCFG();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<TargetPassConfig>();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineLoopInfo>();
       AU.addPreserved<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index fccb536b568691..496c2b999409b6 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -151,7 +151,7 @@ char &llvm::PrologEpilogCodeInserterID = PEI::ID;
 INITIALIZE_PASS_BEGIN(PEI, DEBUG_TYPE, "Prologue/Epilogue Insertion", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
 INITIALIZE_PASS_END(PEI, DEBUG_TYPE,
                     "Prologue/Epilogue Insertion & Frame Finalization", false,
@@ -167,7 +167,7 @@ STATISTIC(NumBytesStackSpace,
 void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
   AU.addPreserved<MachineLoopInfo>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index 5bd3b126aa1666..181337ca4d60f0 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -136,7 +136,7 @@ INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
 INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
 INITIALIZE_PASS_DEPENDENCY(LiveStacks)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 348277224c7aee..500ceb3d8b7006 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -160,7 +160,7 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_DEPENDENCY(RegisterCoalescer)
 INITIALIZE_PASS_DEPENDENCY(MachineScheduler)
 INITIALIZE_PASS_DEPENDENCY(LiveStacks)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
 INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
@@ -213,8 +213,8 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<LiveDebugVariables>();
   AU.addRequired<LiveStacks>();
   AU.addPreserved<LiveStacks>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<VirtRegMap>();
@@ -2729,7 +2729,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
   // SlotIndexes::getApproxInstrDistance.
   Indexes->packIndexes();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
   Loops = &getAnalysis<MachineLoopInfo>();
   Bundles = &getAnalysis<EdgeBundles>();
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.h b/llvm/lib/CodeGen/RegAllocGreedy.h
index 1941643bba9e66..06cf0828ea79b3 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.h
+++ b/llvm/lib/CodeGen/RegAllocGreedy.h
@@ -425,7 +425,7 @@ class LLVM_LIBRARY_VISIBILITY RAGreedy : public MachineFunctionPass,
                ZeroCostFoldedReloads || Copies);
     }
 
-    void add(RAGreedyStats other) {
+    void add(const RAGreedyStats &other) {
       Reloads += other.Reloads;
       FoldedReloads += other.FoldedReloads;
       ZeroCostFoldedReloads += other.ZeroCostFoldedReloads;
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index aea92788057971..88ba843067e5aa 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -557,8 +557,8 @@ void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
   au.addPreserved<MachineBlockFrequencyInfo>();
   au.addRequired<MachineLoopInfo>();
   au.addPreserved<MachineLoopInfo>();
-  au.addRequired<MachineDominatorTree>();
-  au.addPreserved<MachineDominatorTree>();
+  au.addRequired<MachineDominatorTreeWrapperPass>();
+  au.addPreserved<MachineDominatorTreeWrapperPass>();
   au.addRequired<VirtRegMap>();
   au.addPreserved<VirtRegMap>();
   MachineFunctionPass::getAnalysisUsage(au);
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index a4b2299abc20f9..fa9b7895239d38 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -225,7 +225,7 @@ class ShrinkWrap : public MachineFunctionPass {
   /// Initialize the pass for \p MF.
   void init(MachineFunction &MF) {
     RCI.runOnMachineFunction(MF);
-    MDT = &getAnalysis<MachineDominatorTree>();
+    MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
     MPDT = &getAnalysis<MachinePostDominatorTree>();
     Save = nullptr;
     Restore = nullptr;
@@ -262,7 +262,7 @@ class ShrinkWrap : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
     AU.addRequired<MachineBlockFrequencyInfo>();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachineOptimizationRemarkEmitterPass>();
@@ -289,7 +289,7 @@ char &llvm::ShrinkWrapID = ShrinkWrap::ID;
 
 INITIALIZE_PASS_BEGIN(ShrinkWrap, DEBUG_TYPE, "Shrink Wrap Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineOptimizationRemarkEmitterPass)
@@ -670,7 +670,7 @@ bool ShrinkWrap::postShrinkWrapping(bool HasCandidate, MachineFunction &MF,
   Save = NewSave;
   Restore = NewRestore;
 
-  MDT->runOnMachineFunction(MF);
+  MDT->recalculate(MF);
   MPDT->runOnMachineFunction(MF);
 
   assert((MDT->dominates(Save, Restore) && MPDT->dominates(Restore, Save)) &&
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 9fdc8a338b52a5..eb7a113b575f75 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveDebugVariables.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/CodeGen/LiveIntervals.h"
@@ -64,6 +65,7 @@ namespace {
     MachineFrameInfo *MFI = nullptr;
     const TargetInstrInfo *TII = nullptr;
     const MachineBlockFrequencyInfo *MBFI = nullptr;
+    SlotIndexes *Indexes = nullptr;
 
     // SSIntervals - Spill slot intervals.
     std::vector<LiveInterval*> SSIntervals;
@@ -152,6 +154,14 @@ namespace {
       AU.addRequired<MachineBlockFrequencyInfo>();
       AU.addPreserved<MachineBlockFrequencyInfo>();
       AU.addPreservedID(MachineDominatorsID);
+
+      // In some Target's pipeline, register allocation (RA) might be
+      // split into multiple phases based on register class. So, this pass
+      // may be invoked multiple times requiring it to save these analyses to be
+      // used by RA later.
+      AU.addPreserved<LiveIntervals>();
+      AU.addPreserved<LiveDebugVariables>();
+
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -496,8 +506,11 @@ bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
     ++I;
   }
 
-  for (MachineInstr *MI : toErase)
+  for (MachineInstr *MI : toErase) {
+    if (Indexes)
+      Indexes->removeMachineInstrFromMaps(*MI);
     MI->eraseFromParent();
+  }
 
   return changed;
 }
@@ -515,6 +528,7 @@ bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   LS = &getAnalysis<LiveStacks>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+  Indexes = &getAnalysis<SlotIndexes>();
 
   bool Changed = false;
 
diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index 1a60e9abbe2e26..4cf025261d6204 100644
--- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -90,7 +90,7 @@ char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID;
 
 void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addPreserved<MachineLoopInfo>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -98,7 +98,9 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
   df_iterator_default_set<MachineBasicBlock*> Reachable;
   bool ModifiedPHI = false;
 
-  MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  MachineDominatorTreeWrapperPass *MDTWrapper =
+      getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+  MachineDominatorTree *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
   MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
 
   // Mark all reachable blocks.
diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp
index d40725838c943c..a74362e8883972 100644
--- a/llvm/lib/CodeGen/XRayInstrumentation.cpp
+++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -53,7 +53,7 @@ struct XRayInstrumentation : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addPreserved<MachineLoopInfo>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -170,7 +170,9 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
 
     if (!IgnoreLoops) {
       // Get MachineDominatorTree or compute it on the fly if it's unavailable
-      auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+      auto *MDTWrapper =
+          getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+      auto *MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
       MachineDominatorTree ComputedMDT;
       if (!MDT) {
         ComputedMDT.getBase().recalculate(MF);
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index 4e99893c3af7e7..0d56ce16ee7f0b 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -181,7 +181,7 @@ template class llvm::SymbolTableListTraits<Instruction,
 BasicBlock::BasicBlock(LLVMContext &C, const Twine &Name, Function *NewParent,
                        BasicBlock *InsertBefore)
     : Value(Type::getLabelTy(C), Value::BasicBlockVal),
-      IsNewDbgInfoFormat(UseNewDbgInfoFormat), Parent(nullptr) {
+      IsNewDbgInfoFormat(false), Parent(nullptr) {
 
   if (NewParent)
     insertInto(NewParent, InsertBefore);
diff --git a/llvm/lib/IR/DebugProgramInstruction.cpp b/llvm/lib/IR/DebugProgramInstruction.cpp
index 9a4926c81dca27..fbca7cdfcf3f55 100644
--- a/llvm/lib/IR/DebugProgramInstruction.cpp
+++ b/llvm/lib/IR/DebugProgramInstruction.cpp
@@ -366,8 +366,8 @@ void DbgVariableRecord::setKillLocation() {
 }
 
 bool DbgVariableRecord::isKillLocation() const {
-  return (!hasArgList() && isa<MDNode>(getRawLocation())) ||
-         (getNumVariableLocationOps() == 0 && !getExpression()->isComplex()) ||
+  return (getNumVariableLocationOps() == 0 &&
+          !getExpression()->isComplex()) ||
          any_of(location_ops(), [](Value *V) { return isa<UndefValue>(V); });
 }
 
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 9360e6d7d274c8..3f735020e87402 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -83,8 +83,6 @@ static cl::opt<int> NonGlobalValueMaxNameSize(
     "non-global-value-max-name-size", cl::Hidden, cl::init(1024),
     cl::desc("Maximum size for the name of non-global values."));
 
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
 void Function::convertToNewDbgValues() {
   IsNewDbgInfoFormat = true;
   for (auto &BB : *this) {
@@ -443,7 +441,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, unsigned AddrSpace,
     : GlobalObject(Ty, Value::FunctionVal,
                    OperandTraits<Function>::op_begin(this), 0, Linkage, name,
                    computeAddrSpace(AddrSpace, ParentModule)),
-      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
+      NumArgs(Ty->getNumParams()), IsNewDbgInfoFormat(false) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
   setGlobalObjectSubClassData(0);
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index aec927a8cf31d9..29272e627a1d1d 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -1268,23 +1268,12 @@ Instruction *Instruction::cloneImpl() const {
 
 void Instruction::swapProfMetadata() {
   MDNode *ProfileData = getBranchWeightMDNode(*this);
-  if (!ProfileData)
-    return;
-  unsigned FirstIdx = getBranchWeightOffset(ProfileData);
-  if (ProfileData->getNumOperands() != 2 + FirstIdx)
+  if (!ProfileData || ProfileData->getNumOperands() != 3)
     return;
 
-  unsigned SecondIdx = FirstIdx + 1;
-  SmallVector<Metadata *, 4> Ops;
-  // If there are more weights past the second, we can't swap them
-  if (ProfileData->getNumOperands() > SecondIdx + 1)
-    return;
-  for (unsigned Idx = 0; Idx < FirstIdx; ++Idx) {
-    Ops.push_back(ProfileData->getOperand(Idx));
-  }
-  // Switch the order of the weights
-  Ops.push_back(ProfileData->getOperand(SecondIdx));
-  Ops.push_back(ProfileData->getOperand(FirstIdx));
+  // The first operand is the name. Fetch them backwards and build a new one.
+  Metadata *Ops[] = {ProfileData->getOperand(0), ProfileData->getOperand(2),
+                     ProfileData->getOperand(1)};
   setMetadata(LLVMContext::MD_prof,
               MDNode::get(ProfileData->getContext(), Ops));
 }
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index de369bd62a6179..1213f078d05eca 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -5199,11 +5199,7 @@ void SwitchInstProfUpdateWrapper::init() {
   if (!ProfileData)
     return;
 
-  // FIXME: This check belongs in ProfDataUtils. Its almost equivalent to
-  // getValidBranchWeightMDNode(), but the need to use llvm_unreachable
-  // makes them slightly different.
-  if (ProfileData->getNumOperands() !=
-      SI.getNumSuccessors() + getBranchWeightOffset(ProfileData)) {
+  if (ProfileData->getNumOperands() != SI.getNumSuccessors() + 1) {
     llvm_unreachable("number of prof branch_weights metadata operands does "
                      "not correspond to number of succesors");
   }
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index 000027754d13ec..bd68db3a6f9616 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -35,8 +35,8 @@ MDNode *MDBuilder::createFPMath(float Accuracy) {
 }
 
 MDNode *MDBuilder::createBranchWeights(uint32_t TrueWeight,
-                                       uint32_t FalseWeight, bool IsExpected) {
-  return createBranchWeights({TrueWeight, FalseWeight}, IsExpected);
+                                       uint32_t FalseWeight) {
+  return createBranchWeights({TrueWeight, FalseWeight});
 }
 
 MDNode *MDBuilder::createLikelyBranchWeights() {
@@ -49,19 +49,15 @@ MDNode *MDBuilder::createUnlikelyBranchWeights() {
   return createBranchWeights(1, (1U << 20) - 1);
 }
 
-MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights,
-                                       bool IsExpected) {
+MDNode *MDBuilder::createBranchWeights(ArrayRef<uint32_t> Weights) {
   assert(Weights.size() >= 1 && "Need at least one branch weights!");
 
-  unsigned int Offset = IsExpected ? 2 : 1;
-  SmallVector<Metadata *, 4> Vals(Weights.size() + Offset);
+  SmallVector<Metadata *, 4> Vals(Weights.size() + 1);
   Vals[0] = createString("branch_weights");
-  if (IsExpected)
-    Vals[1] = createString("expected");
 
   Type *Int32Ty = Type::getInt32Ty(Context);
   for (unsigned i = 0, e = Weights.size(); i != e; ++i)
-    Vals[i + Offset] = createConstant(ConstantInt::get(Int32Ty, Weights[i]));
+    Vals[i + 1] = createConstant(ConstantInt::get(Int32Ty, Weights[i]));
 
   return MDNode::get(Context, Vals);
 }
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 9459e7a985d842..06164a3e6aeca8 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -1196,10 +1196,10 @@ MDNode *MDNode::mergeDirectCallProfMetadata(MDNode *A, MDNode *B,
   StringRef AProfName = AMDS->getString();
   StringRef BProfName = BMDS->getString();
   if (AProfName == "branch_weights" && BProfName == "branch_weights") {
-    ConstantInt *AInstrWeight = mdconst::dyn_extract<ConstantInt>(
-        A->getOperand(getBranchWeightOffset(A)));
-    ConstantInt *BInstrWeight = mdconst::dyn_extract<ConstantInt>(
-        B->getOperand(getBranchWeightOffset(B)));
+    ConstantInt *AInstrWeight =
+        mdconst::dyn_extract<ConstantInt>(A->getOperand(1));
+    ConstantInt *BInstrWeight =
+        mdconst::dyn_extract<ConstantInt>(B->getOperand(1));
     assert(AInstrWeight && BInstrWeight && "verified by LLVM verifier");
     return MDNode::get(Ctx,
                        {MDHelper.createString("branch_weights"),
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 55c282cb25e793..f97dd18c736c51 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -54,8 +54,6 @@
 
 using namespace llvm;
 
-extern cl::opt<bool> UseNewDbgInfoFormat;
-
 //===----------------------------------------------------------------------===//
 // Methods to implement the globals and functions lists.
 //
@@ -74,7 +72,7 @@ template class llvm::SymbolTableListTraits<GlobalIFunc>;
 Module::Module(StringRef MID, LLVMContext &C)
     : Context(C), ValSymTab(std::make_unique<ValueSymbolTable>(-1)),
       ModuleID(std::string(MID)), SourceFileName(std::string(MID)), DL(""),
-      IsNewDbgInfoFormat(UseNewDbgInfoFormat) {
+      IsNewDbgInfoFormat(false) {
   Context.addModule(this);
 }
 
diff --git a/llvm/lib/IR/ProfDataUtils.cpp b/llvm/lib/IR/ProfDataUtils.cpp
index c4b1ed55de8a22..51e78dc5e6c008 100644
--- a/llvm/lib/IR/ProfDataUtils.cpp
+++ b/llvm/lib/IR/ProfDataUtils.cpp
@@ -40,6 +40,9 @@ namespace {
 // We maintain some constants here to ensure that we access the branch weights
 // correctly, and can change the behavior in the future if the layout changes
 
+// The index at which the weights vector starts
+constexpr unsigned WeightsIdx = 1;
+
 // the minimum number of operands for MD_prof nodes with branch weights
 constexpr unsigned MinBWOps = 3;
 
@@ -72,7 +75,6 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
   assert(isBranchWeightMD(ProfileData) && "wrong metadata");
 
   unsigned NOps = ProfileData->getNumOperands();
-  unsigned WeightsIdx = getBranchWeightOffset(ProfileData);
   assert(WeightsIdx < NOps && "Weights Index must be less than NOps.");
   Weights.resize(NOps - WeightsIdx);
 
@@ -80,8 +82,8 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
     ConstantInt *Weight =
         mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
     assert(Weight && "Malformed branch_weight in MD_prof node");
-    assert(Weight->getValue().getActiveBits() <= (sizeof(T) * 8) &&
-           "Too many bits for MD_prof branch_weight");
+    assert(Weight->getValue().getActiveBits() <= 32 &&
+           "Too many bits for uint32_t");
     Weights[Idx - WeightsIdx] = Weight->getZExtValue();
   }
 }
@@ -121,26 +123,6 @@ bool hasValidBranchWeightMD(const Instruction &I) {
   return getValidBranchWeightMDNode(I);
 }
 
-bool hasBranchWeightOrigin(const Instruction &I) {
-  auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
-  return hasBranchWeightOrigin(ProfileData);
-}
-
-bool hasBranchWeightOrigin(const MDNode *ProfileData) {
-  if (!isBranchWeightMD(ProfileData))
-    return false;
-  auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(1));
-  // NOTE: if we ever have more types of branch weight provenance,
-  // we need to check the string value is "expected". For now, we
-  // supply a more generic API, and avoid the spurious comparisons.
-  assert(ProfDataName == nullptr || ProfDataName->getString() == "expected");
-  return ProfDataName != nullptr;
-}
-
-unsigned getBranchWeightOffset(const MDNode *ProfileData) {
-  return hasBranchWeightOrigin(ProfileData) ? 2 : 1;
-}
-
 MDNode *getBranchWeightMDNode(const Instruction &I) {
   auto *ProfileData = I.getMetadata(LLVMContext::MD_prof);
   if (!isBranchWeightMD(ProfileData))
@@ -150,9 +132,7 @@ MDNode *getBranchWeightMDNode(const Instruction &I) {
 
 MDNode *getValidBranchWeightMDNode(const Instruction &I) {
   auto *ProfileData = getBranchWeightMDNode(I);
-  auto Offset = getBranchWeightOffset(ProfileData);
-  if (ProfileData &&
-      ProfileData->getNumOperands() == Offset + I.getNumSuccessors())
+  if (ProfileData && ProfileData->getNumOperands() == 1 + I.getNumSuccessors())
     return ProfileData;
   return nullptr;
 }
@@ -211,8 +191,7 @@ bool extractProfTotalWeight(const MDNode *ProfileData, uint64_t &TotalVal) {
     return false;
 
   if (ProfDataName->getString() == "branch_weights") {
-    unsigned Offset = getBranchWeightOffset(ProfileData);
-    for (unsigned Idx = Offset; Idx < ProfileData->getNumOperands(); ++Idx) {
+    for (unsigned Idx = 1; Idx < ProfileData->getNumOperands(); Idx++) {
       auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(Idx));
       assert(V && "Malformed branch_weight in MD_prof node");
       TotalVal += V->getValue().getZExtValue();
@@ -233,10 +212,9 @@ bool extractProfTotalWeight(const Instruction &I, uint64_t &TotalVal) {
   return extractProfTotalWeight(I.getMetadata(LLVMContext::MD_prof), TotalVal);
 }
 
-void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
-                      bool IsExpected) {
+void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights) {
   MDBuilder MDB(I.getContext());
-  MDNode *BranchWeights = MDB.createBranchWeights(Weights, IsExpected);
+  MDNode *BranchWeights = MDB.createBranchWeights(Weights);
   I.setMetadata(LLVMContext::MD_prof, BranchWeights);
 }
 
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index 0e592266d52ad3..018a703d6f21a8 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -104,7 +104,6 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/IR/PassManager.h"
-#include "llvm/IR/ProfDataUtils.h"
 #include "llvm/IR/Statepoint.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Use.h"
@@ -4885,10 +4884,8 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
 
   // Check consistency of !prof branch_weights metadata.
   if (ProfName == "branch_weights") {
-    unsigned int Offset = getBranchWeightOffset(MD);
     if (isa<InvokeInst>(&I)) {
-      Check(MD->getNumOperands() == (1 + Offset) ||
-                MD->getNumOperands() == (2 + Offset),
+      Check(MD->getNumOperands() == 2 || MD->getNumOperands() == 3,
             "Wrong number of InvokeInst branch_weights operands", MD);
     } else {
       unsigned ExpectedNumOperands = 0;
@@ -4908,10 +4905,10 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
         CheckFailed("!prof branch_weights are not allowed for this instruction",
                     MD);
 
-      Check(MD->getNumOperands() == Offset + ExpectedNumOperands,
+      Check(MD->getNumOperands() == 1 + ExpectedNumOperands,
             "Wrong number of operands", MD);
     }
-    for (unsigned i = Offset; i < MD->getNumOperands(); ++i) {
+    for (unsigned i = 1; i < MD->getNumOperands(); ++i) {
       auto &MDO = MD->getOperand(i);
       Check(MDO, "second operand should not be null", MD);
       Check(mdconst::dyn_extract<ConstantInt>(MDO),
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index a7157e7a37b439..8490853eda87c2 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -820,7 +820,7 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
   for (MCSection &Sec : *this) {
     // Create dummy fragments to eliminate any empty sections, this simplifies
     // layout.
-    if (Sec.getFragmentList().empty())
+    if (Sec.empty())
       new MCDataFragment(&Sec);
 
     Sec.setOrdinal(SectionIndex++);
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index a83c03749ca332..26fed22be5c872 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -44,6 +44,7 @@
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
@@ -555,16 +556,42 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
   if (GroupSym)
     Group = GroupSym->getName();
   assert(!(LinkedToSym && LinkedToSym->getName().empty()));
-  // Do the lookup, if we have a hit, return it.
-  auto IterBool = ELFUniquingMap.insert(std::make_pair(
-      ELFSectionKey{Section.str(), Group,
-                    LinkedToSym ? LinkedToSym->getName() : "", UniqueID},
-      nullptr));
-  auto &Entry = *IterBool.first;
-  if (!IterBool.second)
-    return Entry.second;
 
-  StringRef CachedName = Entry.first.SectionName;
+  // Sections are differentiated by the quadruple (section_name, group_name,
+  // unique_id, link_to_symbol_name). Sections sharing the same quadruple are
+  // combined into one section. As an optimization, non-unique sections without
+  // group or linked-to symbol have a shorter unique-ing key.
+  std::pair<StringMap<MCSectionELF *>::iterator, bool> EntryNewPair;
+  // Length of the section name, which are the first SectionLen bytes of the key
+  unsigned SectionLen;
+  if (GroupSym || LinkedToSym || UniqueID != MCSection::NonUniqueID) {
+    SmallString<128> Buffer;
+    Section.toVector(Buffer);
+    SectionLen = Buffer.size();
+    Buffer.push_back(0); // separator which cannot occur in the name
+    if (GroupSym)
+      Buffer.append(GroupSym->getName());
+    Buffer.push_back(0); // separator which cannot occur in the name
+    if (LinkedToSym)
+      Buffer.append(LinkedToSym->getName());
+    support::endian::write(Buffer, UniqueID, endianness::native);
+    StringRef UniqueMapKey = StringRef(Buffer);
+    EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr));
+  } else if (!Section.isSingleStringRef()) {
+    SmallString<128> Buffer;
+    StringRef UniqueMapKey = Section.toStringRef(Buffer);
+    SectionLen = UniqueMapKey.size();
+    EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr));
+  } else {
+    StringRef UniqueMapKey = Section.getSingleStringRef();
+    SectionLen = UniqueMapKey.size();
+    EntryNewPair = ELFUniquingMap.insert(std::make_pair(UniqueMapKey, nullptr));
+  }
+
+  if (!EntryNewPair.second)
+    return EntryNewPair.first->second;
+
+  StringRef CachedName = EntryNewPair.first->getKey().take_front(SectionLen);
 
   SectionKind Kind;
   if (Flags & ELF::SHF_ARM_PURECODE)
@@ -608,7 +635,7 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
   MCSectionELF *Result =
       createELFSectionImpl(CachedName, Type, Flags, Kind, EntrySize, GroupSym,
                            IsComdat, UniqueID, LinkedToSym);
-  Entry.second = Result;
+  EntryNewPair.first->second = Result;
 
   recordELFMergeableSectionInfo(Result->getName(), Result->getFlags(),
                                 Result->getUniqueID(), Result->getEntrySize());
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 8d3873bed9efdd..ae4e6915fa294c 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -202,7 +202,7 @@ void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
 MCFragment *MCObjectStreamer::getCurrentFragment() const {
   assert(getCurrentSectionOnly() && "No current section!");
 
-  if (CurInsertionPoint != getCurrentSectionOnly()->getFragmentList().begin())
+  if (CurInsertionPoint != getCurrentSectionOnly()->begin())
     return &*std::prev(CurInsertionPoint);
 
   return nullptr;
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 12e69f70537b78..9848d7fafe764a 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -130,10 +130,13 @@ LLVM_DUMP_METHOD void MCSection::dump() const {
 
   OS << "<MCSection Name:" << getName();
   OS << " Fragments:[\n      ";
-  for (auto it = begin(), ie = end(); it != ie; ++it) {
-    if (it != begin())
+  bool First = true;
+  for (auto &F : *this) {
+    if (First)
+      First = false;
+    else
       OS << ",\n      ";
-    it->dump();
+    F.dump();
   }
   OS << "]>";
 }
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index d17e6e125d8727..de8bde4211b49b 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -767,11 +767,9 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
   if (!Asm.CGProfile.empty()) {
     MCSection *CGProfileSection = Asm.getContext().getMachOSection(
         "__LLVM", "__cg_profile", 0, SectionKind::getMetadata());
-    MCDataFragment *Frag = dyn_cast_or_null<MCDataFragment>(
-        &*CGProfileSection->getFragmentList().begin());
-    assert(Frag && "call graph profile section not reserved");
-    Frag->getContents().clear();
-    raw_svector_ostream OS(Frag->getContents());
+    auto &Frag = cast<MCDataFragment>(*CGProfileSection->begin());
+    Frag.getContents().clear();
+    raw_svector_ostream OS(Frag.getContents());
     for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
       uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
       uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 788e92f94b2689..451269608f1799 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -1857,14 +1857,9 @@ uint64_t WasmObjectWriter::writeOneObject(MCAssembler &Asm,
       report_fatal_error(".fini_array sections are unsupported");
     if (!WS.getName().starts_with(".init_array"))
       continue;
-    if (WS.getFragmentList().empty())
-      continue;
-
-    // init_array is expected to contain a single non-empty data fragment
-    if (WS.getFragmentList().size() != 3)
-      report_fatal_error("only one .init_array section fragment supported");
-
     auto IT = WS.begin();
+    if (IT == WS.end())
+      continue;
     const MCFragment &EmptyFrag = *IT;
     if (EmptyFrag.getKind() != MCFragment::FT_Data)
       report_fatal_error(".init_array section should be aligned");
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 3c9ff71b6b0622..a2b6c4e5c3a5c5 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -354,7 +354,7 @@ void WinCOFFWriter::defineSection(const MCSectionCOFF &MCSec,
   Section->MCSection = &MCSec;
   SectionMap[&MCSec] = Section;
 
-  if (UseOffsetLabels && !MCSec.getFragmentList().empty()) {
+  if (UseOffsetLabels && !MCSec.empty()) {
     const uint32_t Interval = 1 << OffsetLabelIntervalBits;
     uint32_t N = 1;
     for (uint32_t Off = Interval, E = Layout.getSectionAddressSize(&MCSec);
diff --git a/llvm/lib/ObjCopy/ELF/ELFObject.h b/llvm/lib/ObjCopy/ELF/ELFObject.h
index f72c109b6009e8..2b1895a30b41ed 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObject.h
+++ b/llvm/lib/ObjCopy/ELF/ELFObject.h
@@ -910,7 +910,7 @@ class RelocationSection
 
 public:
   RelocationSection(const Object &O) : Obj(O) {}
-  void addRelocation(Relocation Rel) { Relocations.push_back(Rel); }
+  void addRelocation(const Relocation &Rel) { Relocations.push_back(Rel); }
   Error accept(SectionVisitor &Visitor) const override;
   Error accept(MutableSectionVisitor &Visitor) override;
   Error removeSectionReferences(
diff --git a/llvm/lib/ProfileData/MemProf.cpp b/llvm/lib/ProfileData/MemProf.cpp
index 4ca868722ec4e3..6d784053f877d4 100644
--- a/llvm/lib/ProfileData/MemProf.cpp
+++ b/llvm/lib/ProfileData/MemProf.cpp
@@ -394,7 +394,7 @@ Expected<MemProfSchema> readMemProfSchema(const unsigned char *&Buffer) {
     }
     Result.push_back(static_cast<Meta>(Tag));
   }
-  // Advace the buffer to one past the schema if we succeeded.
+  // Advance the buffer to one past the schema if we succeeded.
   Buffer = Ptr;
   return Result;
 }
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 283fcc153b33aa..1209bf71a287d7 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -68,6 +68,10 @@ enum class fltNonfiniteBehavior {
   // `fltNanEncoding` enum. We treat all NaNs as quiet, as the available
   // encodings do not distinguish between signalling and quiet NaN.
   NanOnly,
+
+  // This behavior is present in Float6E3M2FN and Float6E2M3FN types,
+  // which do not support Inf or NaN values.
+  FiniteOnly,
 };
 
 // How NaN values are represented. This is curently only used in combination
@@ -139,6 +143,10 @@ static constexpr fltSemantics semFloat8E4M3FNUZ = {
 static constexpr fltSemantics semFloat8E4M3B11FNUZ = {
     4, -10, 4, 8, fltNonfiniteBehavior::NanOnly, fltNanEncoding::NegativeZero};
 static constexpr fltSemantics semFloatTF32 = {127, -126, 11, 19};
+static constexpr fltSemantics semFloat6E3M2FN = {
+    4, -2, 3, 6, fltNonfiniteBehavior::FiniteOnly};
+static constexpr fltSemantics semFloat6E2M3FN = {
+    2, 0, 4, 6, fltNonfiniteBehavior::FiniteOnly};
 static constexpr fltSemantics semX87DoubleExtended = {16383, -16382, 64, 80};
 static constexpr fltSemantics semBogus = {0, 0, 0, 0};
 
@@ -206,6 +214,10 @@ const llvm::fltSemantics &APFloatBase::EnumToSemantics(Semantics S) {
     return Float8E4M3B11FNUZ();
   case S_FloatTF32:
     return FloatTF32();
+  case S_Float6E3M2FN:
+    return Float6E3M2FN();
+  case S_Float6E2M3FN:
+    return Float6E2M3FN();
   case S_x87DoubleExtended:
     return x87DoubleExtended();
   }
@@ -238,6 +250,10 @@ APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
     return S_Float8E4M3B11FNUZ;
   else if (&Sem == &llvm::APFloat::FloatTF32())
     return S_FloatTF32;
+  else if (&Sem == &llvm::APFloat::Float6E3M2FN())
+    return S_Float6E3M2FN;
+  else if (&Sem == &llvm::APFloat::Float6E2M3FN())
+    return S_Float6E2M3FN;
   else if (&Sem == &llvm::APFloat::x87DoubleExtended())
     return S_x87DoubleExtended;
   else
@@ -260,6 +276,8 @@ const fltSemantics &APFloatBase::Float8E4M3B11FNUZ() {
   return semFloat8E4M3B11FNUZ;
 }
 const fltSemantics &APFloatBase::FloatTF32() { return semFloatTF32; }
+const fltSemantics &APFloatBase::Float6E3M2FN() { return semFloat6E3M2FN; }
+const fltSemantics &APFloatBase::Float6E2M3FN() { return semFloat6E2M3FN; }
 const fltSemantics &APFloatBase::x87DoubleExtended() {
   return semX87DoubleExtended;
 }
@@ -878,6 +896,9 @@ void IEEEFloat::copySignificand(const IEEEFloat &rhs) {
    for the significand.  If double or longer, this is a signalling NaN,
    which may not be ideal.  If float, this is QNaN(0).  */
 void IEEEFloat::makeNaN(bool SNaN, bool Negative, const APInt *fill) {
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+    llvm_unreachable("This floating point format does not support NaN");
+
   category = fcNaN;
   sign = Negative;
   exponent = exponentNaN();
@@ -1499,16 +1520,18 @@ static void tcSetLeastSignificantBits(APInt::WordType *dst, unsigned parts,
 /* Handle overflow.  Sign is preserved.  We either become infinity or
    the largest finite number.  */
 IEEEFloat::opStatus IEEEFloat::handleOverflow(roundingMode rounding_mode) {
-  /* Infinity?  */
-  if (rounding_mode == rmNearestTiesToEven ||
-      rounding_mode == rmNearestTiesToAway ||
-      (rounding_mode == rmTowardPositive && !sign) ||
-      (rounding_mode == rmTowardNegative && sign)) {
-    if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
-      makeNaN(false, sign);
-    else
-      category = fcInfinity;
-    return (opStatus) (opOverflow | opInexact);
+  if (semantics->nonFiniteBehavior != fltNonfiniteBehavior::FiniteOnly) {
+    /* Infinity?  */
+    if (rounding_mode == rmNearestTiesToEven ||
+        rounding_mode == rmNearestTiesToAway ||
+        (rounding_mode == rmTowardPositive && !sign) ||
+        (rounding_mode == rmTowardNegative && sign)) {
+      if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+        makeNaN(false, sign);
+      else
+        category = fcInfinity;
+      return static_cast<opStatus>(opOverflow | opInexact);
+    }
   }
 
   /* Otherwise we become the largest finite number.  */
@@ -3518,13 +3541,15 @@ APInt IEEEFloat::convertIEEEFloatToAPInt() const {
     myexponent = ::exponentZero(S) + bias;
     mysignificand.fill(0);
   } else if (category == fcInfinity) {
-    if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
+    if (S.nonFiniteBehavior == fltNonfiniteBehavior::NanOnly ||
+        S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
       llvm_unreachable("semantics don't support inf!");
-    }
     myexponent = ::exponentInf(S) + bias;
     mysignificand.fill(0);
   } else {
     assert(category == fcNaN && "Unknown category!");
+    if (S.nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+      llvm_unreachable("semantics don't support NaN!");
     myexponent = ::exponentNaN(S) + bias;
     std::copy_n(significandParts(), mysignificand.size(),
                 mysignificand.begin());
@@ -3605,6 +3630,16 @@ APInt IEEEFloat::convertFloatTF32APFloatToAPInt() const {
   return convertIEEEFloatToAPInt<semFloatTF32>();
 }
 
+APInt IEEEFloat::convertFloat6E3M2FNAPFloatToAPInt() const {
+  assert(partCount() == 1);
+  return convertIEEEFloatToAPInt<semFloat6E3M2FN>();
+}
+
+APInt IEEEFloat::convertFloat6E2M3FNAPFloatToAPInt() const {
+  assert(partCount() == 1);
+  return convertIEEEFloatToAPInt<semFloat6E2M3FN>();
+}
+
 // This function creates an APInt that is just a bit map of the floating
 // point constant as it would appear in memory.  It is not a conversion,
 // and treating the result as a normal integer is unlikely to be useful.
@@ -3646,6 +3681,12 @@ APInt IEEEFloat::bitcastToAPInt() const {
   if (semantics == (const llvm::fltSemantics *)&semFloatTF32)
     return convertFloatTF32APFloatToAPInt();
 
+  if (semantics == (const llvm::fltSemantics *)&semFloat6E3M2FN)
+    return convertFloat6E3M2FNAPFloatToAPInt();
+
+  if (semantics == (const llvm::fltSemantics *)&semFloat6E2M3FN)
+    return convertFloat6E2M3FNAPFloatToAPInt();
+
   assert(semantics == (const llvm::fltSemantics*)&semX87DoubleExtended &&
          "unknown format!");
   return convertF80LongDoubleAPFloatToAPInt();
@@ -3862,6 +3903,14 @@ void IEEEFloat::initFromFloatTF32APInt(const APInt &api) {
   initFromIEEEAPInt<semFloatTF32>(api);
 }
 
+void IEEEFloat::initFromFloat6E3M2FNAPInt(const APInt &api) {
+  initFromIEEEAPInt<semFloat6E3M2FN>(api);
+}
+
+void IEEEFloat::initFromFloat6E2M3FNAPInt(const APInt &api) {
+  initFromIEEEAPInt<semFloat6E2M3FN>(api);
+}
+
 /// Treat api as containing the bits of a floating point number.
 void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
   assert(api.getBitWidth() == Sem->sizeInBits);
@@ -3891,6 +3940,10 @@ void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
     return initFromFloat8E4M3B11FNUZAPInt(api);
   if (Sem == &semFloatTF32)
     return initFromFloatTF32APInt(api);
+  if (Sem == &semFloat6E3M2FN)
+    return initFromFloat6E3M2FNAPInt(api);
+  if (Sem == &semFloat6E2M3FN)
+    return initFromFloat6E2M3FNAPInt(api);
 
   llvm_unreachable(nullptr);
 }
@@ -4328,7 +4381,8 @@ int IEEEFloat::getExactLog2Abs() const {
 bool IEEEFloat::isSignaling() const {
   if (!isNaN())
     return false;
-  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly)
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly ||
+      semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
     return false;
 
   // IEEE-754R 2008 6.2.1: A signaling NaN bit string should be encoded with the
@@ -4387,6 +4441,10 @@ IEEEFloat::opStatus IEEEFloat::next(bool nextDown) {
         // nextUp(getLargest()) == NAN
         makeNaN();
         break;
+      } else if (semantics->nonFiniteBehavior ==
+                 fltNonfiniteBehavior::FiniteOnly) {
+        // nextUp(getLargest()) == getLargest()
+        break;
       } else {
         // nextUp(getLargest()) == INFINITY
         APInt::tcSet(significandParts(), 0, partCount());
@@ -4477,6 +4535,9 @@ APFloatBase::ExponentType IEEEFloat::exponentZero() const {
 }
 
 void IEEEFloat::makeInf(bool Negative) {
+  if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::FiniteOnly)
+    llvm_unreachable("This floating point format does not support Inf");
+
   if (semantics->nonFiniteBehavior == fltNonfiniteBehavior::NanOnly) {
     // There is no Inf, so make NaN instead.
     makeNaN(false, Negative);
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 7da540f8ef8e50..da11539eab348f 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -90,6 +90,8 @@ class AArch64AsmPrinter : public AsmPrinter {
     return MCInstLowering.lowerOperand(MO, MCOp);
   }
 
+  const MCExpr *lowerConstantPtrAuth(const ConstantPtrAuth &CPA) override;
+
   void emitStartOfAsmFile(Module &M) override;
   void emitJumpTableInfo() override;
   std::tuple<const MCSymbol *, uint64_t, const MCSymbol *,
@@ -1575,6 +1577,52 @@ void AArch64AsmPrinter::emitPtrauthBranch(const MachineInstr *MI) {
   assert(STI->getInstrInfo()->getInstSizeInBytes(*MI) >= InstsEmitted * 4);
 }
 
+const MCExpr *
+AArch64AsmPrinter::lowerConstantPtrAuth(const ConstantPtrAuth &CPA) {
+  MCContext &Ctx = OutContext;
+
+  // Figure out the base symbol and the addend, if any.
+  APInt Offset(64, 0);
+  const Value *BaseGV = CPA.getPointer()->stripAndAccumulateConstantOffsets(
+      getDataLayout(), Offset, /*AllowNonInbounds=*/true);
+
+  auto *BaseGVB = dyn_cast<GlobalValue>(BaseGV);
+
+  // If we can't understand the referenced ConstantExpr, there's nothing
+  // else we can do: emit an error.
+  if (!BaseGVB) {
+    BaseGV->getContext().emitError(
+        "cannot resolve target base/addend of ptrauth constant");
+    return nullptr;
+  }
+
+  // If there is an addend, turn that into the appropriate MCExpr.
+  const MCExpr *Sym = MCSymbolRefExpr::create(getSymbol(BaseGVB), Ctx);
+  if (Offset.sgt(0))
+    Sym = MCBinaryExpr::createAdd(
+        Sym, MCConstantExpr::create(Offset.getSExtValue(), Ctx), Ctx);
+  else if (Offset.slt(0))
+    Sym = MCBinaryExpr::createSub(
+        Sym, MCConstantExpr::create((-Offset).getSExtValue(), Ctx), Ctx);
+
+  uint64_t KeyID = CPA.getKey()->getZExtValue();
+  // We later rely on valid KeyID value in AArch64PACKeyIDToString call from
+  // AArch64AuthMCExpr::printImpl, so fail fast.
+  if (KeyID > AArch64PACKey::LAST)
+    report_fatal_error("AArch64 PAC Key ID '" + Twine(KeyID) +
+                       "' out of range [0, " +
+                       Twine((unsigned)AArch64PACKey::LAST) + "]");
+
+  uint64_t Disc = CPA.getDiscriminator()->getZExtValue();
+  if (!isUInt<16>(Disc))
+    report_fatal_error("AArch64 PAC Discriminator '" + Twine(Disc) +
+                       "' out of range [0, 0xFFFF]");
+
+  // Finally build the complete @AUTH expr.
+  return AArch64AuthMCExpr::create(Sym, Disc, AArch64PACKey::ID(KeyID),
+                                   CPA.hasAddressDiscriminator(), Ctx);
+}
+
 // Simple pseudo-instructions have their lowering (with expansion to real
 // instructions) auto-generated.
 #include "AArch64GenMCPseudoLowering.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 3f244ba10102af..154ae43b29d574 100644
--- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -50,7 +50,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
       return false;
     }
 
-    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    MachineDominatorTree *DT =
+        &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
     return VisitNode(DT->getRootNode(), 0);
   }
 
@@ -138,7 +139,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 2a4a3c0df08f9c..68243258a68f5d 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -126,7 +126,7 @@ char AArch64ConditionOptimizer::ID = 0;
 
 INITIALIZE_PASS_BEGIN(AArch64ConditionOptimizer, "aarch64-condopt",
                       "AArch64 CondOpt Pass", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(AArch64ConditionOptimizer, "aarch64-condopt",
                     "AArch64 CondOpt Pass", false, false)
 
@@ -135,8 +135,8 @@ FunctionPass *llvm::createAArch64ConditionOptimizerPass() {
 }
 
 void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -332,7 +332,7 @@ bool AArch64ConditionOptimizer::runOnMachineFunction(MachineFunction &MF) {
     return false;
 
   TII = MF.getSubtarget().getInstrInfo();
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MRI = &MF.getRegInfo();
 
   bool Changed = false;
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 8c16a88a13a408..9a788123b1ffa3 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -795,7 +795,7 @@ char AArch64ConditionalCompares::ID = 0;
 INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
                       "AArch64 CCMP Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
 INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
                     "AArch64 CCMP Pass", false, false)
@@ -806,8 +806,8 @@ FunctionPass *llvm::createAArch64ConditionalCompares() {
 
 void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfo>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineLoopInfo>();
   AU.addPreserved<MachineLoopInfo>();
   AU.addRequired<MachineTraceMetrics>();
@@ -933,7 +933,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
   TRI = MF.getSubtarget().getRegisterInfo();
   SchedModel = MF.getSubtarget().getSchedModel();
   MRI = &MF.getRegInfo();
-  DomTree = &getAnalysis<MachineDominatorTree>();
+  DomTree = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfo>();
   MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index cd532671f50189..cf617c7e92a70a 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -431,8 +431,16 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
   const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
   uint64_t NumBytes = AFI->getLocalStackSize();
 
+  // If neither NEON or SVE are available, a COPY from one Q-reg to
+  // another requires a spill -> reload sequence. We can do that
+  // using a pre-decrementing store/post-decrementing load, but
+  // if we do so, we can't use the Red Zone.
+  bool LowerQRegCopyThroughMem = Subtarget.hasFPARMv8() &&
+                                 !Subtarget.isNeonAvailable() &&
+                                 !Subtarget.hasSVE();
+
   return !(MFI.hasCalls() || hasFP(MF) || NumBytes > RedZoneSize ||
-           getSVEStackSize(MF));
+           getSVEStackSize(MF) || LowerQRegCopyThroughMem);
 }
 
 /// hasFP - Return true if the specified function should have a dedicated frame
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 87d737d7ffe61c..c4f819f5fcdd29 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -728,14 +728,14 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FCOPYSIGN, MVT::bf16, Promote);
   }
 
-  for (auto Op : {ISD::FREM,         ISD::FPOW,         ISD::FPOWI,
-                  ISD::FCOS,         ISD::FSIN,         ISD::FSINCOS,
-                  ISD::FTAN,         ISD::FEXP,         ISD::FEXP2,
-                  ISD::FEXP10,       ISD::FLOG,         ISD::FLOG2,
-                  ISD::FLOG10,       ISD::STRICT_FREM,  ISD::STRICT_FPOW,
-                  ISD::STRICT_FPOWI, ISD::STRICT_FCOS,  ISD::STRICT_FSIN,
-                  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2, ISD::STRICT_FLOG,
-                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10}) {
+  for (auto Op : {ISD::FREM,         ISD::FPOW,          ISD::FPOWI,
+                  ISD::FCOS,         ISD::FSIN,          ISD::FSINCOS,
+                  ISD::FTAN,         ISD::FEXP,          ISD::FEXP2,
+                  ISD::FEXP10,       ISD::FLOG,          ISD::FLOG2,
+                  ISD::FLOG10,       ISD::STRICT_FREM,   ISD::STRICT_FPOW,
+                  ISD::STRICT_FPOWI, ISD::STRICT_FCOS,   ISD::STRICT_FSIN,
+                  ISD::STRICT_FEXP,  ISD::STRICT_FEXP2,  ISD::STRICT_FLOG,
+                  ISD::STRICT_FLOG2, ISD::STRICT_FLOG10, ISD::STRICT_FTAN}) {
     setOperationAction(Op, MVT::f16, Promote);
     setOperationAction(Op, MVT::v4f16, Expand);
     setOperationAction(Op, MVT::v8f16, Expand);
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 270474f80767aa..322bde3da67631 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -1022,7 +1022,7 @@ static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
 
   if (!IsTailCall) {
     if (!PAI)
-      return IsIndirect ? getBLRCallOpcode(CallerF) : AArch64::BL;
+      return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL;
 
     assert(IsIndirect && "Direct call should not be authenticated");
     assert((PAI->Key == AArch64PACKey::IA || PAI->Key == AArch64PACKey::IB) &&
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
index 0c7be9f42c570d..f71fe323a6d358 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -524,8 +524,8 @@ void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
   if (!IsOptNone) {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addRequired<GISelCSEAnalysisWrapperPass>();
     AU.addPreserved<GISelCSEAnalysisWrapperPass>();
   }
@@ -557,7 +557,8 @@ bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
 
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
-      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+      IsOptNone ? nullptr
+                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   GISelCSEAnalysisWrapper &Wrapper =
       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 31f77be20f348b..e9b25924b35f73 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -823,8 +823,8 @@ void AArch64PreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   getSelectionDAGFallbackAnalysisUsage(AU);
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<GISelCSEAnalysisWrapperPass>();
   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -856,7 +856,8 @@ bool AArch64PreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   bool EnableOpt =
       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
-  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *MDT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                      /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                      F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
index a0c6bf7cc31c0a..8c914382b1ecb2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp
@@ -46,7 +46,7 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineUniformityAnalysisPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -192,7 +192,7 @@ void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) {
 
 INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
                       "AMDGPU GlobalISel divergence lowering", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
 INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
@@ -209,7 +209,8 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
 
 bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction(
     MachineFunction &MF) {
-  MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree &DT =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
   MachineUniformityInfo &MUI =
       getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
index f36374b08b34d5..46d44704af5a7d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -465,8 +465,8 @@ void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
   if (!IsOptNone) {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -494,7 +494,8 @@ bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
 
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
-      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+      IsOptNone ? nullptr
+                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                      LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
index 3f01a328afaf83..4d0cb467ba374d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -238,8 +238,8 @@ void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
   if (!IsOptNone) {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
 
   AU.addRequired<GISelCSEAnalysisWrapperPass>();
@@ -272,7 +272,8 @@ bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
 
   const GCNSubtarget &STI = MF.getSubtarget<GCNSubtarget>();
   MachineDominatorTree *MDT =
-      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+      IsOptNone ? nullptr
+                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                      nullptr, EnableOpt, F.hasOptSize(), F.hasMinSize());
   AMDGPUPreLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, CSEInfo, RuleConfig,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
index 35abd6eddde851..74f0540239c939 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -421,8 +421,8 @@ void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
   if (!IsOptNone) {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -449,7 +449,8 @@ bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
 
   const auto *LI = ST.getLegalizerInfo();
   MachineDominatorTree *MDT =
-      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+      IsOptNone ? nullptr
+                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                      LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
index 2ea03ddb1fccd6..d1985f46b1c448 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankSelect.cpp
@@ -33,7 +33,7 @@ StringRef AMDGPURegBankSelect::getPassName() const {
 
 void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineCycleInfoWrapperPass>();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   // TODO: Preserve DomTree
   RegBankSelect::getAnalysisUsage(AU);
 }
@@ -41,7 +41,7 @@ void AMDGPURegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
 INITIALIZE_PASS_BEGIN(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
                       "AMDGPU Register Bank Select", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineCycleInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(AMDGPURegBankSelect, "amdgpu-" DEBUG_TYPE,
                     "AMDGPU Register Bank Select", false, false)
 
@@ -63,7 +63,8 @@ bool AMDGPURegBankSelect::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   MachineCycleInfo &CycleInfo =
       getAnalysis<MachineCycleInfoWrapperPass>().getCycleInfo();
-  MachineDominatorTree &DomTree = getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree &DomTree =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   MachineUniformityInfo Uniformity =
       computeMachineUniformityInfo(MF, CycleInfo, DomTree.getBase(),
diff --git a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
index 0a96c643d9bdc6..b35f59bc5ba30b 100644
--- a/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600MachineCFGStructurizer.cpp
@@ -113,7 +113,7 @@ class R600MachineCFGStructurizer : public MachineFunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -140,8 +140,8 @@ class R600MachineCFGStructurizer : public MachineFunctionPass {
     FuncRep = &MF;
     MLI = &getAnalysis<MachineLoopInfo>();
     LLVM_DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
-    MDT = &getAnalysis<MachineDominatorTree>();
-    LLVM_DEBUG(MDT->print(dbgs(), (const Module *)nullptr););
+    MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
+    LLVM_DEBUG(MDT->print(dbgs()););
     PDT = &getAnalysis<MachinePostDominatorTree>();
     LLVM_DEBUG(PDT->print(dbgs()););
     prepare();
@@ -1629,7 +1629,7 @@ void R600MachineCFGStructurizer::retireBlock(MachineBasicBlock *MBB) {
 
 INITIALIZE_PASS_BEGIN(R600MachineCFGStructurizer, "amdgpustructurizer",
                       "AMDGPU CFG Structurizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(R600MachineCFGStructurizer, "amdgpustructurizer",
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index 77935cb4cde1a9..8bac570d59d4a6 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -103,8 +103,8 @@ class R600VectorRegMerger : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineLoopInfo>();
     AU.addPreserved<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
index 59e27478759096..64185db02ec1de 100644
--- a/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600Packetizer.cpp
@@ -35,8 +35,8 @@ class R600Packetizer : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineLoopInfo>();
     AU.addPreserved<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index a00ca625fc7390..68c5f23c8e11f3 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -162,8 +162,8 @@ class SIFixSGPRCopies : public MachineFunctionPass {
   StringRef getPassName() const override { return "SI Fix SGPR copies"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -173,7 +173,7 @@ class SIFixSGPRCopies : public MachineFunctionPass {
 
 INITIALIZE_PASS_BEGIN(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SIFixSGPRCopies, DEBUG_TYPE,
                      "SI Fix SGPR copies", false, false)
 
@@ -611,8 +611,7 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   TRI = ST.getRegisterInfo();
   TII = ST.getInstrInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
-
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
                                                   BI != BE; ++BI) {
diff --git a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
index abb72e8e63c336..afc6353ec81167 100644
--- a/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SILateBranchLowering.cpp
@@ -48,8 +48,8 @@ class SILateBranchLowering : public MachineFunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -60,7 +60,7 @@ char SILateBranchLowering::ID = 0;
 
 INITIALIZE_PASS_BEGIN(SILateBranchLowering, DEBUG_TYPE,
                       "SI insert s_cbranch_execz instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(SILateBranchLowering, DEBUG_TYPE,
                     "SI insert s_cbranch_execz instructions", false, false)
 
@@ -149,7 +149,7 @@ bool SILateBranchLowering::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   MovOpc = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
   ExecReg = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 5dc3457b5bfae1..75a1575f2180ea 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -149,7 +149,7 @@ class SILowerControlFlow : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addUsedIfAvailable<LiveIntervals>();
     // Should preserve the same set that TwoAddressInstructions does.
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
     AU.addPreservedID(LiveVariablesID);
@@ -764,7 +764,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
   LIS = getAnalysisIfAvailable<LiveIntervals>();
   // This doesn't actually need LiveVariables, but we can preserve them.
   LV = getAnalysisIfAvailable<LiveVariables>();
-  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
   MRI = &MF.getRegInfo();
   BoolRC = TRI->getBoolRC();
 
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 32dad0c425c041..9f0a9e03701b48 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -51,7 +51,7 @@ class SILowerI1Copies : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -399,7 +399,7 @@ class LoopFinder {
 
 INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                       false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
                     false)
@@ -445,8 +445,9 @@ bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
           MachineFunctionProperties::Property::Selected))
     return false;
 
-  Vreg1LoweringHelper Helper(&TheMF, &getAnalysis<MachineDominatorTree>(),
-                             &getAnalysis<MachinePostDominatorTree>());
+  Vreg1LoweringHelper Helper(
+      &TheMF, &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree(),
+      &getAnalysis<MachinePostDominatorTree>());
 
   bool Changed = false;
   Changed |= Helper.lowerCopiesFromI1();
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
index 8204a70e72d916..18d66e4191522e 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeVGPRLiveRange.cpp
@@ -148,10 +148,10 @@ class SIOptimizeVGPRLiveRange : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveVariables>();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineLoopInfo>();
     AU.addPreserved<LiveVariables>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addPreserved<MachineLoopInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -618,7 +618,7 @@ char SIOptimizeVGPRLiveRange::ID = 0;
 
 INITIALIZE_PASS_BEGIN(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
                       "SI Optimize VGPR LiveRange", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(LiveVariables)
 INITIALIZE_PASS_END(SIOptimizeVGPRLiveRange, DEBUG_TYPE,
@@ -635,7 +635,7 @@ bool SIOptimizeVGPRLiveRange::runOnMachineFunction(MachineFunction &MF) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   TII = ST.getInstrInfo();
   TRI = &TII->getRegisterInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   Loops = &getAnalysis<MachineLoopInfo>();
   LV = &getAnalysis<LiveVariables>();
   MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 64f153b4807973..b4624ad9a52371 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3312,7 +3312,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
                                               MachineInstr &Use,
                                               MachineRegisterInfo &MRI,
                                               LiveIntervals *LIS) const {
-  auto &MDT = LIS->getAnalysis<MachineDominatorTree>();
+  auto &MDT = LIS->getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   SlotIndex UseIdx = LIS->getInstructionIndex(Use);
   SlotIndex DefIdx;
 
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 913942dda19d91..811cd410fe31c6 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -241,7 +241,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addPreserved<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -259,7 +259,7 @@ char SIWholeQuadMode::ID = 0;
 INITIALIZE_PASS_BEGIN(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIWholeQuadMode, DEBUG_TYPE, "SI Whole Quad Mode", false,
                     false)
@@ -1687,7 +1687,8 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
   LIS = &getAnalysis<LiveIntervals>();
-  MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+  auto *MDTWrapper = getAnalysisIfAvailable<MachineDominatorTreeWrapperPass>();
+  MDT = MDTWrapper ? &MDTWrapper->getDomTree() : nullptr;
   PDT = getAnalysisIfAvailable<MachinePostDominatorTree>();
 
   if (ST->isWave32()) {
diff --git a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
index 0e3e4d34aa6a14..9d616e103f1717 100644
--- a/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
+++ b/llvm/lib/Target/ARC/ARCBranchFinalize.cpp
@@ -61,7 +61,7 @@ char ARCBranchFinalize::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ARCBranchFinalize, "arc-branch-finalize",
                       "ARC finalize branches", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(ARCBranchFinalize, "arc-branch-finalize",
                     "ARC finalize branches", false, false)
 
diff --git a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
index e7a0b352db8d97..36f811c0aa003f 100644
--- a/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
+++ b/llvm/lib/Target/ARC/ARCOptAddrMode.cpp
@@ -60,8 +60,8 @@ class ARCOptAddrMode : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -119,7 +119,7 @@ class ARCOptAddrMode : public MachineFunctionPass {
 char ARCOptAddrMode::ID = 0;
 INITIALIZE_PASS_BEGIN(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
                       false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(ARCOptAddrMode, OPTADDRMODE_NAME, OPTADDRMODE_DESC, false,
                     false)
 
@@ -508,7 +508,7 @@ bool ARCOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
   AST = &MF.getSubtarget<ARCSubtarget>();
   AII = AST->getInstrInfo();
   MRI = &MF.getRegInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   bool Changed = false;
   for (auto &MBB : MF)
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 9579053943f9f0..90f5c6c40b49c6 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -229,7 +229,7 @@ namespace {
     bool runOnMachineFunction(MachineFunction &MF) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -399,7 +399,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
   isPositionIndependentOrROPI =
       STI->getTargetLowering()->isPositionIndependent() || STI->isROPI();
   AFI = MF->getInfo<ARMFunctionInfo>();
-  DT = &getAnalysis<MachineDominatorTree>();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   isThumb = AFI->isThumbFunction();
   isThumb1 = AFI->isThumb1OnlyFunction();
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 4a5b672f862bec..e5e817f1ed9a29 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -2161,8 +2161,8 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AAResultsWrapperPass>();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -2186,7 +2186,7 @@ char ARMPreAllocLoadStoreOpt::ID = 0;
 
 INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
                       ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
                     ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
 
@@ -2204,7 +2204,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TII = STI->getInstrInfo();
   TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
-  DT = &getAnalysis<MachineDominatorTree>();
+  DT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MF  = &Fn;
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index de7449a400a741..b6cecccf3572dd 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -329,7 +329,7 @@ void ARMELFObjectWriter::addTargetSectionFlags(MCContext &Ctx,
   MCSectionELF *TextSection =
       static_cast<MCSectionELF *>(Ctx.getObjectFileInfo()->getTextSection());
   if (Sec.getKind().isExecuteOnly() && !TextSection->hasInstructions()) {
-    for (auto &F : TextSection->getFragmentList())
+    for (auto &F : *TextSection)
       if (auto *DF = dyn_cast<MCDataFragment>(&F))
         if (!DF->getContents().empty())
           return;
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
index c9bbc41ac13bac..4882e8533caf1f 100644
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -59,8 +59,8 @@ class MVETPAndVPTOptimisations : public MachineFunctionPass {
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineLoopInfo>();
     AU.addPreserved<MachineLoopInfo>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -93,7 +93,7 @@ INITIALIZE_PASS_BEGIN(MVETPAndVPTOptimisations, DEBUG_TYPE,
                       "ARM MVE TailPred and VPT Optimisations pass", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(MVETPAndVPTOptimisations, DEBUG_TYPE,
                     "ARM MVE TailPred and VPT Optimisations pass", false, false)
 
@@ -1065,7 +1065,8 @@ bool MVETPAndVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
   TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
   MRI = &Fn.getRegInfo();
   MachineLoopInfo *MLI = &getAnalysis<MachineLoopInfo>();
-  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
                     << "********** Function: " << Fn.getName() << '\n');
diff --git a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
index 4acdd571f6c968..97bdd4c45a8c68 100644
--- a/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
+++ b/llvm/lib/Target/CSKY/CSKYConstantIslandPass.cpp
@@ -218,7 +218,7 @@ class CSKYConstantIslands : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 4c18e076c43936..99745941d57986 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -219,8 +219,8 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -285,7 +285,7 @@ char HexagonBitSimplify::ID = 0;
 
 INITIALIZE_PASS_BEGIN(HexagonBitSimplify, "hexagon-bit-simplify",
       "Hexagon bit simplification", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(HexagonBitSimplify, "hexagon-bit-simplify",
       "Hexagon bit simplification", false, false)
 
@@ -2800,7 +2800,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
   auto &HRI = *HST.getRegisterInfo();
   auto &HII = *HST.getInstrInfo();
 
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   bool Changed;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index f2a02fe9540bfa..f0933765bbcbda 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -218,8 +218,8 @@ namespace {
     HexagonConstExtenders() : MachineFunctionPass(ID) {}
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -569,7 +569,7 @@ namespace {
 
 INITIALIZE_PASS_BEGIN(HexagonConstExtenders, "hexagon-cext-opt",
       "Hexagon constant-extender optimization", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(HexagonConstExtenders, "hexagon-cext-opt",
       "Hexagon constant-extender optimization", false, false)
 
@@ -1973,7 +1973,7 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
   HST = &MF.getSubtarget<HexagonSubtarget>();
   HII = HST->getInstrInfo();
   HRI = HST->getRegisterInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MRI = &MF.getRegInfo();
   AssignmentMap IMap;
 
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
index 97917270601bc8..a5c47e67de892f 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyHoisting.cpp
@@ -50,8 +50,8 @@ class HexagonCopyHoisting : public MachineFunctionPass {
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index cb820e2158992d..03f6882e6889f1 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -162,8 +162,8 @@ namespace {
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBranchProbabilityInfo>();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -1054,7 +1054,7 @@ bool HexagonEarlyIfConversion::runOnMachineFunction(MachineFunction &MF) {
   TRI = ST.getRegisterInfo();
   MFN = &MF;
   MRI = &MF.getRegInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MLI = &getAnalysis<MachineLoopInfo>();
   MBPI = EnableHexagonBP ? &getAnalysis<MachineBranchProbabilityInfo>() :
     nullptr;
diff --git a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 204f3b6b20c751..8a23b7743e8395 100644
--- a/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -155,8 +155,8 @@ namespace {
       AU.addRequired<LiveIntervals>();
       AU.addPreserved<LiveIntervals>();
       AU.addPreserved<SlotIndexes>();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -254,7 +254,7 @@ namespace llvm {
 
 INITIALIZE_PASS_BEGIN(HexagonExpandCondsets, "expand-condsets",
   "Hexagon Expand Condsets", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(HexagonExpandCondsets, "expand-condsets",
@@ -1277,7 +1277,7 @@ bool HexagonExpandCondsets::runOnMachineFunction(MachineFunction &MF) {
 
   HII = static_cast<const HexagonInstrInfo*>(MF.getSubtarget().getInstrInfo());
   TRI = MF.getSubtarget().getRegisterInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   LIS = &getAnalysis<LiveIntervals>();
   MRI = &MF.getRegInfo();
 
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 232651132d6e4f..a5d290a61f3283 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -413,7 +413,7 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
   auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
 
   MachineDominatorTree MDT;
-  MDT.runOnMachineFunction(MF);
+  MDT.calculate(MF);
   MachinePostDominatorTree MPT;
   MPT.runOnMachineFunction(MF);
 
diff --git a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
index 1e373f6061bbfc..a4304b0531666a 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -515,8 +515,8 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -1497,7 +1497,7 @@ bool HexagonGenInsert::runOnMachineFunction(MachineFunction &MF) {
   HRI = ST.getRegisterInfo();
   MFN = &MF;
   MRI = &MF.getRegInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   // Clean up before any further processing, so that dead code does not
   // get used in a newly generated "insert" instruction. Have a custom
@@ -1607,6 +1607,6 @@ FunctionPass *llvm::createHexagonGenInsert() {
 
 INITIALIZE_PASS_BEGIN(HexagonGenInsert, "hexinsert",
   "Hexagon generate \"insert\" instructions", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(HexagonGenInsert, "hexinsert",
   "Hexagon generate \"insert\" instructions", false, false)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
index afd49631943f26..651ccc2db9ba24 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenMemAbsolute.cpp
@@ -56,8 +56,8 @@ class HexagonGenMemAbsolute : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     MachineFunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -82,7 +82,8 @@ bool HexagonGenMemAbsolute::runOnMachineFunction(MachineFunction &Fn) {
   MRI = &Fn.getRegInfo();
   TRI = Fn.getRegInfo().getTargetRegisterInfo();
 
-  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree &MDT =
+      getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   // Loop over all of the basic blocks
   for (MachineFunction::iterator MBBb = Fn.begin(), MBBe = Fn.end();
diff --git a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
index 92e74327361150..5bb2d7d80ad541 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp
@@ -93,8 +93,8 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -130,7 +130,7 @@ char HexagonGenPredicate::ID = 0;
 
 INITIALIZE_PASS_BEGIN(HexagonGenPredicate, "hexagon-gen-pred",
   "Hexagon generate predicate operations", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(HexagonGenPredicate, "hexagon-gen-pred",
   "Hexagon generate predicate operations", false, false)
 
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 31e37dcce415f1..19a024078b1045 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -118,7 +118,7 @@ namespace {
     StringRef getPassName() const override { return "Hexagon Hardware Loops"; }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -368,7 +368,7 @@ namespace {
 
 INITIALIZE_PASS_BEGIN(HexagonHardwareLoops, "hwloops",
                       "Hexagon Hardware Loops", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops",
                     "Hexagon Hardware Loops", false, false)
@@ -386,7 +386,7 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
 
   MLI = &getAnalysis<MachineLoopInfo>();
   MRI = &MF.getRegInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
   TII = HST.getInstrInfo();
   TRI = HST.getRegisterInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 0e82bf6e5331da..e7f5c257b21c1f 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -70,7 +70,7 @@ class HexagonOptAddrMode : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     MachineFunctionPass::getAnalysisUsage(AU);
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineDominanceFrontier>();
     AU.setPreservesAll();
   }
@@ -122,7 +122,7 @@ char HexagonOptAddrMode::ID = 0;
 
 INITIALIZE_PASS_BEGIN(HexagonOptAddrMode, "amode-opt",
                       "Optimize addressing mode", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
 INITIALIZE_PASS_END(HexagonOptAddrMode, "amode-opt", "Optimize addressing mode",
                     false, false)
@@ -872,7 +872,7 @@ bool HexagonOptAddrMode::runOnMachineFunction(MachineFunction &MF) {
   HII = HST.getInstrInfo();
   HRI = HST.getRegisterInfo();
   const auto &MDF = getAnalysis<MachineDominanceFrontier>();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   DataFlowGraph G(MF, *HII, *HRI, *MDT, MDF);
   // Need to keep dead phis because we can propagate uses of registers into
diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 4131f2a31755fb..3c17f680011496 100644
--- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -63,7 +63,7 @@ namespace {
     HexagonRDFOpt() : MachineFunctionPass(ID) {}
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineDominanceFrontier>();
       AU.setPreservesAll();
       MachineFunctionPass::getAnalysisUsage(AU);
@@ -109,7 +109,7 @@ char HexagonRDFOpt::ID = 0;
 
 INITIALIZE_PASS_BEGIN(HexagonRDFOpt, "hexagon-rdf-opt",
       "Hexagon RDF optimizations", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
 INITIALIZE_PASS_END(HexagonRDFOpt, "hexagon-rdf-opt",
       "Hexagon RDF optimizations", false, false)
@@ -302,7 +302,7 @@ bool HexagonRDFOpt::runOnMachineFunction(MachineFunction &MF) {
     RDFCount++;
   }
 
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const auto &MDF = getAnalysis<MachineDominanceFrontier>();
   const auto &HII = *MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
   const auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 56472d633694ae..2d5352b08caed7 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -97,9 +97,9 @@ namespace {
       AU.setPreservesCFG();
       AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<MachineBranchProbabilityInfo>();
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       AU.addRequired<MachineLoopInfo>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       AU.addPreserved<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -124,7 +124,7 @@ char HexagonPacketizer::ID = 0;
 
 INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer",
                       "Hexagon Packetizer", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index f9a0ba3608e6dc..3b6ea81cdf10ed 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -713,15 +713,14 @@ class HexagonAsmBackend : public MCAsmBackend {
   void finishLayout(MCAssembler const &Asm,
                     MCAsmLayout &Layout) const override {
     for (auto *I : Layout.getSectionOrder()) {
-      auto &Fragments = I->getFragmentList();
-      for (auto &J : Fragments) {
+      for (auto &J : *I) {
         switch (J.getKind()) {
         default:
           break;
         case MCFragment::FT_Align: {
           auto Size = Asm.computeFragmentSize(Layout, J);
           for (auto K = J.getIterator();
-               K != Fragments.begin() && Size >= HEXAGON_PACKET_SIZE;) {
+               K != I->begin() && Size >= HEXAGON_PACKET_SIZE;) {
             --K;
             switch (K->getKind()) {
             default:
diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
index 902c7ceb869ae3..7ceb97642bba13 100644
--- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -83,7 +83,7 @@ class OptimizePICCall : public MachineFunctionPass {
   bool runOnMachineFunction(MachineFunction &F) override;
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -197,7 +197,8 @@ bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
     return false;
 
   // Do a pre-order traversal of the dominator tree.
-  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *MDT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   bool Changed = false;
 
   SmallVector<MBBInfo, 8> WorkList(1, MBBInfo(MDT->getRootNode()));
diff --git a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
index 0578655f0443a6..bd8a065011c928 100644
--- a/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/Mips/MipsPostLegalizerCombiner.cpp
@@ -110,8 +110,8 @@ void MipsPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
   if (!IsOptNone) {
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
   }
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -139,7 +139,8 @@ bool MipsPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
 
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
   MachineDominatorTree *MDT =
-      IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+      IsOptNone ? nullptr
+                : &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   CombinerInfo CInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
                      LI, EnableOpt, F.hasOptSize(), F.hasMinSize());
   MipsPostLegalizerCombinerImpl Impl(MF, CInfo, TPC, *KB, /*CSEInfo*/ nullptr,
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 82770f8660850c..ca077d41d36bac 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -1845,6 +1845,10 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
   auto AddIntToBuffer = [AggBuffer, Bytes](const APInt &Val) {
     size_t NumBytes = (Val.getBitWidth() + 7) / 8;
     SmallVector<unsigned char, 16> Buf(NumBytes);
+    // `extractBitsAsZExtValue` does not allow the extraction of bits beyond the
+    // input's bit width, and i1 arrays may not have a length that is a multuple
+    // of 8. We handle the last byte separately, so we never request out of
+    // bounds bits.
     for (unsigned I = 0; I < NumBytes - 1; ++I) {
       Buf[I] = Val.extractBitsAsZExtValue(8, I * 8);
     }
diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 799890928577c4..bf632801646484 100644
--- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -165,7 +165,7 @@ class PPCBranchCoalescing : public MachineFunctionPass {
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -195,7 +195,7 @@ FunctionPass *llvm::createPPCBranchCoalescingPass() {
 
 INITIALIZE_PASS_BEGIN(PPCBranchCoalescing, DEBUG_TYPE,
                       "Branch Coalescing", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(PPCBranchCoalescing, DEBUG_TYPE, "Branch Coalescing",
                     false, false)
@@ -214,7 +214,7 @@ void PPCBranchCoalescing::CoalescingCandidateInfo::clear() {
 }
 
 void PPCBranchCoalescing::initialize(MachineFunction &MF) {
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MPDT = &getAnalysis<MachinePostDominatorTree>();
   TII = MF.getSubtarget().getInstrInfo();
   MRI = &MF.getRegInfo();
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
index 1f9947f6f32711..c4190bb9a1c4ed 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoopsVerify.cpp
@@ -55,7 +55,7 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.addRequired<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
@@ -70,7 +70,7 @@ namespace {
 
 INITIALIZE_PASS_BEGIN(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                       "PowerPC CTR Loops Verify", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(PPCCTRLoopsVerify, "ppc-ctr-loops-verify",
                     "PowerPC CTR Loops Verify", false, false)
 
@@ -160,7 +160,7 @@ static bool verifyCTRBranch(MachineBasicBlock *MBB,
 }
 
 bool PPCCTRLoopsVerify::runOnMachineFunction(MachineFunction &MF) {
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
 
   // Verify that all bdnz/bdz instructions are dominated by a loop mtctr before
   // any other instructions that might clobber the ctr register.
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index c6db8a7bbeb855..c57b48055d2ad7 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -156,11 +156,11 @@ struct PPCMIPeephole : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<LiveVariables>();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineBlockFrequencyInfo>();
     AU.addPreserved<LiveVariables>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addPreserved<MachinePostDominatorTree>();
     AU.addPreserved<MachineBlockFrequencyInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -200,7 +200,7 @@ void PPCMIPeephole::addRegToUpdateWithLine(Register Reg, int Line) {
 void PPCMIPeephole::initialize(MachineFunction &MFParm) {
   MF = &MFParm;
   MRI = &MF->getRegInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   MPDT = &getAnalysis<MachinePostDominatorTree>();
   MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
   LV = &getAnalysis<LiveVariables>();
@@ -2029,7 +2029,7 @@ bool PPCMIPeephole::combineSEXTAndSHL(MachineInstr &MI,
 INITIALIZE_PASS_BEGIN(PPCMIPeephole, DEBUG_TYPE,
                       "PowerPC MI Peephole Optimization", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(LiveVariables)
 INITIALIZE_PASS_END(PPCMIPeephole, DEBUG_TYPE,
diff --git a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
index 0504db239f6713..d1cc2ad5c481ff 100644
--- a/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
+++ b/llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp
@@ -427,7 +427,7 @@ class PPCReduceCRLogicals : public MachineFunctionPass {
   CRLogicalOpInfo createCRLogicalOpInfo(MachineInstr &MI);
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
@@ -730,7 +730,7 @@ void PPCReduceCRLogicals::collectCRLogicals() {
 
 INITIALIZE_PASS_BEGIN(PPCReduceCRLogicals, DEBUG_TYPE,
                       "PowerPC Reduce CR logical Operation", false, false)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(PPCReduceCRLogicals, DEBUG_TYPE,
                     "PowerPC Reduce CR logical Operation", false, false)
 
diff --git a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
index 0d8c71f9f2e699..69e046972f3d43 100644
--- a/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
+++ b/llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp
@@ -368,8 +368,8 @@ namespace {
       AU.addPreserved<LiveIntervals>();
       AU.addRequired<SlotIndexes>();
       AU.addPreserved<SlotIndexes>();
-      AU.addRequired<MachineDominatorTree>();
-      AU.addPreserved<MachineDominatorTree>();
+      AU.addRequired<MachineDominatorTreeWrapperPass>();
+      AU.addPreserved<MachineDominatorTreeWrapperPass>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
@@ -379,7 +379,7 @@ INITIALIZE_PASS_BEGIN(PPCVSXFMAMutate, DEBUG_TYPE,
                       "PowerPC VSX FMA Mutation", false, false)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_END(PPCVSXFMAMutate, DEBUG_TYPE,
                     "PowerPC VSX FMA Mutation", false, false)
 
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 5906a2cdb3bfa1..8ac1cdf0a7a9ce 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -2155,6 +2155,16 @@ bool RISCVAsmParser::parseVTypeToken(const AsmToken &Tok, VTypeState &State,
       break;
     if (!RISCVVType::isValidLMUL(Lmul, Fractional))
       break;
+
+    if (Fractional) {
+      unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32;
+      unsigned MinLMUL = ELEN / 8;
+      if (Lmul > MinLMUL)
+        Warning(Tok.getLoc(),
+                "use of vtype encodings with LMUL < SEWMIN/ELEN == mf" +
+                    Twine(MinLMUL) + " is reserved");
+    }
+
     State = VTypeState_TailPolicy;
     return false;
   }
@@ -2194,6 +2204,7 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
   bool MaskAgnostic = false;
 
   VTypeState State = VTypeState_SEW;
+  SMLoc SEWLoc = S;
 
   if (parseVTypeToken(getTok(), State, Sew, Lmul, Fractional, TailAgnostic,
                       MaskAgnostic))
@@ -2211,6 +2222,16 @@ ParseStatus RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
 
   if (getLexer().is(AsmToken::EndOfStatement) && State == VTypeState_Done) {
     RISCVII::VLMUL VLMUL = RISCVVType::encodeLMUL(Lmul, Fractional);
+    if (Fractional) {
+      unsigned ELEN = STI->hasFeature(RISCV::FeatureStdExtZve64x) ? 64 : 32;
+      unsigned MaxSEW = ELEN / Lmul;
+      // If MaxSEW < 8, we should have printed warning about reserved LMUL.
+      if (MaxSEW >= 8 && Sew > MaxSEW)
+        Warning(SEWLoc,
+                "use of vtype encodings with SEW > " + Twine(MaxSEW) +
+                    " and LMUL == mf" + Twine(Lmul) +
+                    " may not be compatible with all RVV implementations");
+    }
 
     unsigned VTypeI =
         RISCVVType::encodeVTYPE(VLMUL, Sew, TailAgnostic, MaskAgnostic);
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
index 9c28944abc7672..8fa9dba2853876 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPostLegalizerCombiner.cpp
@@ -112,8 +112,8 @@ void RISCVPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   getSelectionDAGFallbackAnalysisUsage(AU);
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<GISelCSEAnalysisWrapperPass>();
   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -143,7 +143,8 @@ bool RISCVPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   const auto *LI = ST.getLegalizerInfo();
 
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
-  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *MDT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   GISelCSEAnalysisWrapper &Wrapper =
       getAnalysis<GISelCSEAnalysisWrapperPass>().getCSEWrapper();
   auto *CSEInfo = &Wrapper.get(TPC->getCSEConfig());
diff --git a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
index 9a35fffae05890..6a695119be25a2 100644
--- a/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/RISCV/GISel/RISCVPreLegalizerCombiner.cpp
@@ -109,8 +109,8 @@ void RISCVPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
   getSelectionDAGFallbackAnalysisUsage(AU);
   AU.addRequired<GISelKnownBitsAnalysis>();
   AU.addPreserved<GISelKnownBitsAnalysis>();
-  AU.addRequired<MachineDominatorTree>();
-  AU.addPreserved<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
+  AU.addPreserved<MachineDominatorTreeWrapperPass>();
   AU.addRequired<GISelCSEAnalysisWrapperPass>();
   AU.addPreserved<GISelCSEAnalysisWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
@@ -142,7 +142,8 @@ bool RISCVPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
   bool EnableOpt =
       MF.getTarget().getOptLevel() != CodeGenOptLevel::None && !skipFunction(F);
   GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
-  MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *MDT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   CombinerInfo CInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
                      /*LegalizerInfo*/ nullptr, EnableOpt, F.hasOptSize(),
                      F.hasMinSize());
diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td
index 9bf06850483d84..011edca019fd60 100644
--- a/llvm/lib/Target/RISCV/RISCVFeatures.td
+++ b/llvm/lib/Target/RISCV/RISCVFeatures.td
@@ -477,6 +477,14 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
 
 // Bitmanip Extensions for Cryptography Extensions
 
+def FeatureStdExtB
+    : RISCVExtension<"b", 1, 0,
+                     "'B' (the collection of the Zba, Zbb, Zbs extensions)",
+                     [FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs]>;
+def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
+                           AssemblerPredicate<(all_of FeatureStdExtB),
+                           "'B' (the collection of the Zba, Zbb, Zbs extensions)">;
+
 def FeatureStdExtZbkb
     : RISCVExtension<"zbkb", 1, 0,
                      "'Zbkb' (Bitmanip instructions for Cryptography)">;
diff --git a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
index 2744c25d1bc754..52354281cdd7e3 100644
--- a/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
+++ b/llvm/lib/Target/SPIRV/SPIRVMergeRegionExitTargets.cpp
@@ -17,6 +17,8 @@
 #include "SPIRVSubtarget.h"
 #include "SPIRVTargetMachine.h"
 #include "SPIRVUtils.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/IR/CFG.h"
@@ -71,7 +73,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
   /// terminator will take.
   llvm::Value *createExitVariable(
       BasicBlock *BB,
-      const std::unordered_map<BasicBlock *, ConstantInt *> &TargetToValue) {
+      const DenseMap<BasicBlock *, ConstantInt *> &TargetToValue) {
     auto *T = BB->getTerminator();
     if (isa<ReturnInst>(T))
       return nullptr;
@@ -103,7 +105,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
 
   /// Replaces |BB|'s branch targets present in |ToReplace| with |NewTarget|.
   void replaceBranchTargets(BasicBlock *BB,
-                            const std::unordered_set<BasicBlock *> ToReplace,
+                            const SmallPtrSet<BasicBlock *, 4> &ToReplace,
                             BasicBlock *NewTarget) {
     auto *T = BB->getTerminator();
     if (isa<ReturnInst>(T))
@@ -133,7 +135,7 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
   bool runOnConvergenceRegionNoRecurse(LoopInfo &LI,
                                        const SPIRV::ConvergenceRegion *CR) {
     // Gather all the exit targets for this region.
-    std::unordered_set<BasicBlock *> ExitTargets;
+    SmallPtrSet<BasicBlock *, 4> ExitTargets;
     for (BasicBlock *Exit : CR->Exits) {
       for (BasicBlock *Target : gatherSuccessors(Exit)) {
         if (CR->Blocks.count(Target) == 0)
@@ -164,9 +166,10 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
 
     // Creating one constant per distinct exit target. This will be route to the
     // correct target.
-    std::unordered_map<BasicBlock *, ConstantInt *> TargetToValue;
+    DenseMap<BasicBlock *, ConstantInt *> TargetToValue;
     for (BasicBlock *Target : SortedExitTargets)
-      TargetToValue.emplace(Target, Builder.getInt32(TargetToValue.size()));
+      TargetToValue.insert(
+          std::make_pair(Target, Builder.getInt32(TargetToValue.size())));
 
     // Creating one variable per exit node, set to the constant matching the
     // targeted external block.
@@ -184,12 +187,12 @@ class SPIRVMergeRegionExitTargets : public FunctionPass {
     }
 
     // Creating the switch to jump to the correct exit target.
-    std::vector<std::pair<BasicBlock *, ConstantInt *>> CasesList(
-        TargetToValue.begin(), TargetToValue.end());
-    llvm::SwitchInst *Sw =
-        Builder.CreateSwitch(node, CasesList[0].first, CasesList.size() - 1);
-    for (size_t i = 1; i < CasesList.size(); i++)
-      Sw->addCase(CasesList[i].second, CasesList[i].first);
+    llvm::SwitchInst *Sw = Builder.CreateSwitch(node, SortedExitTargets[0],
+                                                SortedExitTargets.size() - 1);
+    for (size_t i = 1; i < SortedExitTargets.size(); i++) {
+      BasicBlock *BB = SortedExitTargets[i];
+      Sw->addCase(TargetToValue[BB], BB);
+    }
 
     // Fix exit branches to redirect to the new exit.
     for (auto Exit : CR->Exits)
diff --git a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
index 8073ed0e2a3c87..bf8d109ff71f36 100644
--- a/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZLDCleanup.cpp
@@ -58,7 +58,7 @@ FunctionPass *llvm::createSystemZLDCleanupPass(SystemZTargetMachine &TM) {
 
 void SystemZLDCleanup::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesCFG();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -75,7 +75,8 @@ bool SystemZLDCleanup::runOnMachineFunction(MachineFunction &F) {
     return false;
   }
 
-  MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+  MachineDominatorTree *DT =
+      &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   return VisitNode(DT->getRootNode(), 0);
 }
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index 06758e46519727..f746bf4307a08a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -53,8 +53,8 @@ class WebAssemblyCFGSort final : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineLoopInfo>();
     AU.addPreserved<MachineLoopInfo>();
     AU.addRequired<WebAssemblyExceptionInfo>();
@@ -387,7 +387,7 @@ bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
 
   const auto &MLI = getAnalysis<MachineLoopInfo>();
   const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   // Liveness is not tracked for VALUE_STACK physreg.
   MF.getRegInfo().invalidateLiveness();
 
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index d8cbddf74545da..77e82a32545f1c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -48,7 +48,7 @@ class WebAssemblyCFGStackify final : public MachineFunctionPass {
   StringRef getPassName() const override { return "WebAssembly CFG Stackify"; }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<WebAssemblyExceptionInfo>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -252,7 +252,7 @@ void WebAssemblyCFGStackify::unregisterScope(MachineInstr *Begin) {
 void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
   assert(!MBB.isEHPad());
   MachineFunction &MF = *MBB.getParent();
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
 
@@ -465,7 +465,7 @@ void WebAssemblyCFGStackify::placeLoopMarker(MachineBasicBlock &MBB) {
 void WebAssemblyCFGStackify::placeTryMarker(MachineBasicBlock &MBB) {
   assert(MBB.isEHPad());
   MachineFunction &MF = *MBB.getParent();
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto &MLI = getAnalysis<MachineLoopInfo>();
   const auto &WEI = getAnalysis<WebAssemblyExceptionInfo>();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
index 8deac76b2bc3d9..f23f21c8f69fb9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -31,7 +31,7 @@ char WebAssemblyExceptionInfo::ID = 0;
 
 INITIALIZE_PASS_BEGIN(WebAssemblyExceptionInfo, DEBUG_TYPE,
                       "WebAssembly Exception Information", true, true)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
 INITIALIZE_PASS_END(WebAssemblyExceptionInfo, DEBUG_TYPE,
                     "WebAssembly Exception Information", true, true)
@@ -45,7 +45,7 @@ bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &MF) {
           ExceptionHandling::Wasm ||
       !MF.getFunction().hasPersonalityFn())
     return false;
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   auto &MDF = getAnalysis<MachineDominanceFrontier>();
   recalculate(MF, MDT, MDF);
   LLVM_DEBUG(dump());
@@ -273,7 +273,7 @@ void WebAssemblyExceptionInfo::releaseMemory() {
 
 void WebAssemblyExceptionInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineDominanceFrontier>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index 2180f57c106a7f..2ab5bcdd838d00 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -56,8 +56,8 @@ class WebAssemblyMemIntrinsicResults final : public MachineFunctionPass {
     AU.setPreservesCFG();
     AU.addRequired<MachineBlockFrequencyInfo>();
     AU.addPreserved<MachineBlockFrequencyInfo>();
-    AU.addRequired<MachineDominatorTree>();
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
@@ -180,7 +180,7 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
   });
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const WebAssemblyTargetLowering &TLI =
       *MF.getSubtarget<WebAssemblySubtarget>().getTargetLowering();
   const auto &LibInfo =
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index d4edb6bf18d932..e38905c20b8390 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -48,13 +48,13 @@ class WebAssemblyRegStackify final : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     AU.addRequired<LiveIntervals>();
     AU.addPreserved<MachineBlockFrequencyInfo>();
     AU.addPreserved<SlotIndexes>();
     AU.addPreserved<LiveIntervals>();
     AU.addPreservedID(LiveVariablesID);
-    AU.addPreserved<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
@@ -813,7 +813,7 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
   WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
   const auto *TRI = MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
-  auto &MDT = getAnalysis<MachineDominatorTree>();
+  auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   auto &LIS = getAnalysis<LiveIntervals>();
 
   // Walk the instructions from the bottom up. Currently we don't look past
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index d9936557776ba1..20e50c8c9e1ae0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -201,6 +201,9 @@ struct RuntimeLibcallSignatureTable {
     Table[RTLIB::COS_F32] = f32_func_f32;
     Table[RTLIB::COS_F64] = f64_func_f64;
     Table[RTLIB::COS_F128] = i64_i64_func_i64_i64;
+    Table[RTLIB::TAN_F32] = f32_func_f32;
+    Table[RTLIB::TAN_F64] = f64_func_f64;
+    Table[RTLIB::TAN_F128] = i64_i64_func_i64_i64;
     Table[RTLIB::SINCOS_F32] = func_f32_iPTR_iPTR;
     Table[RTLIB::SINCOS_F64] = func_f64_iPTR_iPTR;
     Table[RTLIB::SINCOS_F128] = func_i64_i64_iPTR_iPTR;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 472f34a4efdb47..bc2eb6dcd541c7 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -125,9 +125,10 @@ class X86AsmBackend : public MCAsmBackend {
   unsigned TargetPrefixMax = 0;
 
   MCInst PrevInst;
+  unsigned PrevInstOpcode = 0;
   MCBoundaryAlignFragment *PendingBA = nullptr;
   std::pair<MCFragment *, size_t> PrevInstPosition;
-  bool CanPadInst = false;
+  bool IsRightAfterData = false;
 
   uint8_t determinePaddingPrefix(const MCInst &Inst) const;
   bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
@@ -267,8 +268,8 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
 }
 
 /// Check if the instruction is a prefix.
-static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) {
-  return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags);
+static bool isPrefix(unsigned Opcode, const MCInstrInfo &MCII) {
+  return X86II::isPrefix(MCII.get(Opcode).TSFlags);
 }
 
 /// Check if the instruction is valid as the first instruction in macro fusion.
@@ -382,9 +383,9 @@ bool X86AsmBackend::allowEnhancedRelaxation() const {
 
 /// X86 has certain instructions which enable interrupts exactly one
 /// instruction *after* the instruction which stores to SS.  Return true if the
-/// given instruction has such an interrupt delay slot.
-static bool hasInterruptDelaySlot(const MCInst &Inst) {
-  switch (Inst.getOpcode()) {
+/// given instruction may have such an interrupt delay slot.
+static bool mayHaveInterruptDelaySlot(unsigned InstOpcode) {
+  switch (InstOpcode) {
   case X86::POPSS16:
   case X86::POPSS32:
   case X86::STI:
@@ -394,9 +395,9 @@ static bool hasInterruptDelaySlot(const MCInst &Inst) {
   case X86::MOV32sr:
   case X86::MOV64sr:
   case X86::MOV16sm:
-    if (Inst.getOperand(0).getReg() == X86::SS)
-      return true;
-    break;
+    // In fact, this is only the case if the first operand is SS. However, as
+    // segment moves occur extremely rarely, this is just a minor pessimization.
+    return true;
   }
   return false;
 }
@@ -406,16 +407,10 @@ static bool
 isRightAfterData(MCFragment *CurrentFragment,
                  const std::pair<MCFragment *, size_t> &PrevInstPosition) {
   MCFragment *F = CurrentFragment;
-  // Empty data fragments may be created to prevent further data being
-  // added into the previous fragment, we need to skip them since they
-  // have no contents.
-  for (; isa_and_nonnull<MCDataFragment>(F); F = F->getPrevNode())
-    if (cast<MCDataFragment>(F)->getContents().size() != 0)
-      break;
-
   // Since data is always emitted into a DataFragment, our check strategy is
   // simple here.
   //   - If the fragment is a DataFragment
+  //     - If it's empty (section start or data after align), return false.
   //     - If it's not the fragment where the previous instruction is,
   //       returns true.
   //     - If it's the fragment holding the previous instruction but its
@@ -424,8 +419,9 @@ isRightAfterData(MCFragment *CurrentFragment,
   //     - Otherwise returns false.
   //   - If the fragment is not a DataFragment, returns false.
   if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
-    return DF != PrevInstPosition.first ||
-           DF->getContents().size() != PrevInstPosition.second;
+    return DF->getContents().size() &&
+           (DF != PrevInstPosition.first ||
+            DF->getContents().size() != PrevInstPosition.second);
 
   return false;
 }
@@ -455,22 +451,22 @@ bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
     // TLSCALL).
     return false;
 
-  if (hasInterruptDelaySlot(PrevInst))
+  if (mayHaveInterruptDelaySlot(PrevInstOpcode))
     // If this instruction follows an interrupt enabling instruction with a one
     // instruction delay, inserting a nop would change behavior.
     return false;
 
-  if (isPrefix(PrevInst, *MCII))
+  if (isPrefix(PrevInstOpcode, *MCII))
     // If this instruction follows a prefix, inserting a nop/prefix would change
     // semantic.
     return false;
 
-  if (isPrefix(Inst, *MCII))
+  if (isPrefix(Inst.getOpcode(), *MCII))
     // If this instruction is a prefix, inserting a prefix would change
     // semantic.
     return false;
 
-  if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
+  if (IsRightAfterData)
     // If this instruction follows any data, there is no clear
     // instruction boundary, inserting a nop/prefix would change semantic.
     return false;
@@ -514,19 +510,27 @@ bool X86AsmBackend::needAlign(const MCInst &Inst) const {
 /// Insert BoundaryAlignFragment before instructions to align branches.
 void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
                                          const MCInst &Inst, const MCSubtargetInfo &STI) {
-  CanPadInst = canPadInst(Inst, OS);
+  // Used by canPadInst. Done here, because in emitInstructionEnd, the current
+  // fragment will have changed.
+  IsRightAfterData =
+      isRightAfterData(OS.getCurrentFragment(), PrevInstPosition);
 
   if (!canPadBranches(OS))
     return;
 
+  // NB: PrevInst only valid if canPadBranches is true.
   if (!isMacroFused(PrevInst, Inst))
     // Macro fusion doesn't happen indeed, clear the pending.
     PendingBA = nullptr;
 
-  if (!CanPadInst)
+  // When branch padding is enabled (basically the skx102 erratum => unlikely),
+  // we call canPadInst (not cheap) twice. However, in the common case, we can
+  // avoid unnecessary calls to that, as this is otherwise only used for
+  // relaxable fragments.
+  if (!canPadInst(Inst, OS))
     return;
 
-  if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) {
+  if (PendingBA && PendingBA->getNextNode() == OS.getCurrentFragment()) {
     // Macro fusion actually happens and there is no other fragment inserted
     // after the previous instruction.
     //
@@ -557,16 +561,22 @@ void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
 }
 
 /// Set the last fragment to be aligned for the BoundaryAlignFragment.
-void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) {
-  PrevInst = Inst;
+void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
+                                       const MCInst &Inst) {
   MCFragment *CF = OS.getCurrentFragment();
-  PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
   if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
-    F->setAllowAutoPadding(CanPadInst);
+    F->setAllowAutoPadding(canPadInst(Inst, OS));
+
+  // Update PrevInstOpcode here, canPadInst() reads that.
+  PrevInstOpcode = Inst.getOpcode();
+  PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
 
   if (!canPadBranches(OS))
     return;
 
+  // PrevInst is only needed if canPadBranches. Copying an MCInst isn't cheap.
+  PrevInst = Inst;
+
   if (!needAlign(Inst) || !PendingBA)
     return;
 
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index d6d077363f6fbb..9bd4e783f58398 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -127,7 +127,7 @@ FunctionPass *llvm::createX86FlagsCopyLoweringPass() {
 char X86FlagsCopyLoweringPass::ID = 0;
 
 void X86FlagsCopyLoweringPass::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
@@ -257,7 +257,7 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
   MRI = &MF.getRegInfo();
   TII = Subtarget->getInstrInfo();
   TRI = Subtarget->getRegisterInfo();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   PromoteRC = &X86::GR8RegClass;
 
   if (MF.empty())
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2aec14e93d082d..cad3ea4716db3e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -42793,6 +42793,19 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
            SimplifyDemandedBits(Op1, SignMask, KnownSrc, TLO, Depth + 1,
                                 AssumeSingleUse);
   }
+  case X86ISD::CMOV: {
+    KnownBits Known2;
+    if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
+                             OriginalDemandedElts, Known2, TLO, Depth + 1))
+      return true;
+    if (SimplifyDemandedBits(Op.getOperand(0), OriginalDemandedBits,
+                             OriginalDemandedElts, Known, TLO, Depth + 1))
+      return true;
+
+    // Only known if known in both the LHS and RHS.
+    Known = Known.intersectWith(Known2);
+    break;
+  }
   case X86ISD::BEXTR:
   case X86ISD::BEXTRI: {
     SDValue Op0 = Op.getOperand(0);
@@ -50184,12 +50197,12 @@ static SDValue combineAddOrSubToADCOrSBB(bool IsSub, const SDLoc &DL, EVT VT,
 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
 /// with CMP+{ADC, SBB}.
-static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, const SDLoc &DL,
+                                         SelectionDAG &DAG) {
   bool IsSub = N->getOpcode() == ISD::SUB;
   SDValue X = N->getOperand(0);
   SDValue Y = N->getOperand(1);
   EVT VT = N->getValueType(0);
-  SDLoc DL(N);
 
   if (SDValue ADCOrSBB = combineAddOrSubToADCOrSBB(IsSub, DL, VT, X, Y, DAG))
     return ADCOrSBB;
@@ -52718,7 +52731,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
   return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
 }
 
-static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
+static SDValue combineXorSubCTLZ(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
   assert((N->getOpcode() == ISD::XOR || N->getOpcode() == ISD::SUB) &&
          "Invalid opcode for combing with CTLZ");
@@ -52758,7 +52771,6 @@ static SDValue combineXorSubCTLZ(SDNode *N, SelectionDAG &DAG,
 
   if (C->getZExtValue() != uint64_t(OpCTLZ.getValueSizeInBits() - 1))
     return SDValue();
-  SDLoc DL(N);
   EVT OpVT = VT;
   SDValue Op = OpCTLZ.getOperand(0);
   if (VT == MVT::i8) {
@@ -52781,11 +52793,12 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
 
   // If this is SSE1 only convert to FXOR to avoid scalarization.
   if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
     return DAG.getBitcast(MVT::v4i32,
-                          DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
+                          DAG.getNode(X86ISD::FXOR, DL, MVT::v4f32,
                                       DAG.getBitcast(MVT::v4f32, N0),
                                       DAG.getBitcast(MVT::v4f32, N1)));
   }
@@ -52805,7 +52818,7 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
   if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, DCI, Subtarget))
     return FPLogic;
 
-  if (SDValue R = combineXorSubCTLZ(N, DAG, Subtarget))
+  if (SDValue R = combineXorSubCTLZ(N, DL, DAG, Subtarget))
     return R;
 
   if (DCI.isBeforeLegalizeOps())
@@ -52826,8 +52839,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
       N0.getOperand(0).getValueType().isVector() &&
       N0.getOperand(0).getValueType().getVectorElementType() == MVT::i1 &&
       TLI.isTypeLegal(N0.getOperand(0).getValueType()) && N0.hasOneUse()) {
-    return DAG.getBitcast(VT, DAG.getNOT(SDLoc(N), N0.getOperand(0),
-                                         N0.getOperand(0).getValueType()));
+    return DAG.getBitcast(
+        VT, DAG.getNOT(DL, N0.getOperand(0), N0.getOperand(0).getValueType()));
   }
 
   // Handle AVX512 mask widening.
@@ -52837,8 +52850,8 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
       N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.getOperand(0).isUndef() &&
       TLI.isTypeLegal(N0.getOperand(1).getValueType())) {
     return DAG.getNode(
-        ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
-        DAG.getNOT(SDLoc(N), N0.getOperand(1), N0.getOperand(1).getValueType()),
+        ISD::INSERT_SUBVECTOR, DL, VT, N0.getOperand(0),
+        DAG.getNOT(DL, N0.getOperand(1), N0.getOperand(1).getValueType()),
         N0.getOperand(2));
   }
 
@@ -52851,7 +52864,6 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
     auto *N1C = dyn_cast<ConstantSDNode>(N1);
     auto *N001C = dyn_cast<ConstantSDNode>(TruncExtSrc.getOperand(1));
     if (N1C && !N1C->isOpaque() && N001C && !N001C->isOpaque()) {
-      SDLoc DL(N);
       SDValue LHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(0), DL, VT);
       SDValue RHS = DAG.getZExtOrTrunc(TruncExtSrc.getOperand(1), DL, VT);
       return DAG.getNode(ISD::XOR, DL, VT, LHS,
@@ -55419,7 +55431,8 @@ static SDValue combineAddOfPMADDWD(SelectionDAG &DAG, SDValue N0, SDValue N1,
 /// Try to fold those constants into an 'add' instruction to reduce instruction
 /// count. We do this with CMOV rather the generic 'select' because there are
 /// earlier folds that may be used to turn select-of-constants into logic hacks.
-static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
+static SDValue pushAddIntoCmovOfConsts(SDNode *N, const SDLoc &DL,
+                                       SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
   // If an operand is zero, add-of-0 gets simplified away, so that's clearly
   // better because we eliminate 1-2 instructions. This transform is still
@@ -55451,7 +55464,6 @@ static SDValue pushAddIntoCmovOfConsts(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   EVT VT = N->getValueType(0);
-  SDLoc DL(N);
   SDValue FalseOp = Cmov.getOperand(0);
   SDValue TrueOp = Cmov.getOperand(1);
 
@@ -55492,7 +55504,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   SDValue Op1 = N->getOperand(1);
   SDLoc DL(N);
 
-  if (SDValue Select = pushAddIntoCmovOfConsts(N, DAG, Subtarget))
+  if (SDValue Select = pushAddIntoCmovOfConsts(N, DL, DAG, Subtarget))
     return Select;
 
   if (SDValue MAdd = matchPMADDWD(DAG, Op0, Op1, DL, VT, Subtarget))
@@ -55550,7 +55562,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                        Op0.getOperand(0), Op0.getOperand(2));
   }
 
-  return combineAddOrSubToADCOrSBB(N, DAG);
+  return combineAddOrSubToADCOrSBB(N, DL, DAG);
 }
 
 // Try to fold (sub Y, cmovns X, -X) -> (add Y, cmovns -X, X) if the cmov
@@ -55626,6 +55638,7 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                           const X86Subtarget &Subtarget) {
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
+  SDLoc DL(N);
 
   // TODO: Add NoOpaque handling to isConstantIntBuildVectorOrConstantInt.
   auto IsNonOpaqueConstant = [&](SDValue Op) {
@@ -55645,7 +55658,6 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
   if (Op1.getOpcode() == ISD::XOR && IsNonOpaqueConstant(Op0) &&
       !isNullConstant(Op0) && IsNonOpaqueConstant(Op1.getOperand(1)) &&
       Op1->hasOneUse()) {
-    SDLoc DL(N);
     EVT VT = Op0.getValueType();
     SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT, Op1.getOperand(0),
                                  DAG.getNOT(SDLoc(Op1), Op1.getOperand(1), VT));
@@ -55676,14 +55688,14 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
     assert(!Op1->hasAnyUseOfValue(1) && "Overflow bit in use");
     SDValue ADC = DAG.getNode(X86ISD::ADC, SDLoc(Op1), Op1->getVTList(), Op0,
                               Op1.getOperand(1), Op1.getOperand(2));
-    return DAG.getNode(ISD::SUB, SDLoc(N), Op0.getValueType(), ADC.getValue(0),
+    return DAG.getNode(ISD::SUB, DL, Op0.getValueType(), ADC.getValue(0),
                        Op1.getOperand(0));
   }
 
-  if (SDValue V = combineXorSubCTLZ(N, DAG, Subtarget))
+  if (SDValue V = combineXorSubCTLZ(N, DL, DAG, Subtarget))
     return V;
 
-  if (SDValue V = combineAddOrSubToADCOrSBB(N, DAG))
+  if (SDValue V = combineAddOrSubToADCOrSBB(N, DL, DAG))
     return V;
 
   return combineSubSetcc(N, DAG);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 1f93d293bc2aab..ce1bbc8a959bfb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -10324,7 +10324,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
       return false;
     }
 
-    MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
+    MachineDominatorTree *DT =
+        &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
     return VisitNode(DT->getRootNode(), 0);
   }
 
@@ -10411,7 +10412,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
-    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachineDominatorTreeWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
index 4dfe7556df0030..fff5d17160230e 100644
--- a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -237,7 +237,7 @@ void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
     AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
   AU.addRequired<MachineLoopInfo>();
-  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachineDominatorTreeWrapperPass>();
   AU.addRequired<MachineDominanceFrontier>();
   AU.setPreservesCFG();
 }
@@ -270,7 +270,7 @@ bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
   TRI = STI->getRegisterInfo();
   LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
   const auto &MLI = getAnalysis<MachineLoopInfo>();
-  const auto &MDT = getAnalysis<MachineDominatorTree>();
+  const auto &MDT = getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
   const auto &MDF = getAnalysis<MachineDominanceFrontier>();
   std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF);
   LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n");
@@ -801,7 +801,7 @@ bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch(
 INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
                       "X86 LVI load hardening", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
 INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
                     "X86 LVI load hardening", false, false)
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
index 2e88e01ce7fdfa..7b57f7c23bf4da 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -13,6 +13,14 @@
 
 using namespace llvm;
 
+yaml::X86MachineFunctionInfo::X86MachineFunctionInfo(
+    const llvm::X86MachineFunctionInfo &MFI)
+    : AMXProgModel(MFI.getAMXProgModel()) {}
+
+void yaml::X86MachineFunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+  MappingTraits<X86MachineFunctionInfo>::mapping(YamlIO, *this);
+}
+
 MachineFunctionInfo *X86MachineFunctionInfo::clone(
     BumpPtrAllocator &Allocator, MachineFunction &DestMF,
     const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
@@ -20,6 +28,11 @@ MachineFunctionInfo *X86MachineFunctionInfo::clone(
   return DestMF.cloneInfo<X86MachineFunctionInfo>(*this);
 }
 
+void X86MachineFunctionInfo::initializeBaseYamlFields(
+    const yaml::X86MachineFunctionInfo &YamlMFI) {
+  AMXProgModel = YamlMFI.AMXProgModel;
+}
+
 void X86MachineFunctionInfo::anchor() { }
 
 void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 8aaa49945f9d44..af2de2e73dc368 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -16,13 +16,43 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/YAMLTraits.h"
 #include <set>
 
 namespace llvm {
 
 enum AMXProgModelEnum { None = 0, DirectReg = 1, ManagedRA = 2 };
 
+class X86MachineFunctionInfo;
+
+namespace yaml {
+template <> struct ScalarEnumerationTraits<AMXProgModelEnum> {
+  static void enumeration(IO &YamlIO, AMXProgModelEnum &Value) {
+    YamlIO.enumCase(Value, "None", AMXProgModelEnum::None);
+    YamlIO.enumCase(Value, "DirectReg", AMXProgModelEnum::DirectReg);
+    YamlIO.enumCase(Value, "ManagedRA", AMXProgModelEnum::ManagedRA);
+  }
+};
+
+struct X86MachineFunctionInfo final : public yaml::MachineFunctionInfo {
+  AMXProgModelEnum AMXProgModel;
+
+  X86MachineFunctionInfo() = default;
+  X86MachineFunctionInfo(const llvm::X86MachineFunctionInfo &MFI);
+
+  void mappingImpl(yaml::IO &YamlIO) override;
+  ~X86MachineFunctionInfo() = default;
+};
+
+template <> struct MappingTraits<X86MachineFunctionInfo> {
+  static void mapping(IO &YamlIO, X86MachineFunctionInfo &MFI) {
+    YamlIO.mapOptional("amxProgModel", MFI.AMXProgModel);
+  }
+};
+} // end namespace yaml
+
 /// X86MachineFunctionInfo - This class is derived from MachineFunction and
 /// contains private X86 target-specific information for each MachineFunction.
 class X86MachineFunctionInfo : public MachineFunctionInfo {
@@ -160,6 +190,8 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
         const DenseMap<MachineBasicBlock *, MachineBasicBlock *> &Src2DstMBB)
       const override;
 
+  void initializeBaseYamlFields(const yaml::X86MachineFunctionInfo &YamlMFI);
+
   bool getForceFramePointer() const { return ForceFramePointer;}
   void setForceFramePointer(bool forceFP) { ForceFramePointer = forceFP; }
 
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 27542e54829bff..d4e642c7df9cf9 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -31,6 +31,8 @@
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
@@ -344,6 +346,24 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
   return I.get();
 }
 
+yaml::MachineFunctionInfo *X86TargetMachine::createDefaultFuncInfoYAML() const {
+  return new yaml::X86MachineFunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+X86TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+  const auto *MFI = MF.getInfo<X86MachineFunctionInfo>();
+  return new yaml::X86MachineFunctionInfo(*MFI);
+}
+
+bool X86TargetMachine::parseMachineFunctionInfo(
+    const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+    SMDiagnostic &Error, SMRange &SourceRange) const {
+  const auto &YamlMFI = static_cast<const yaml::X86MachineFunctionInfo &>(MFI);
+  PFS.MF.getInfo<X86MachineFunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+  return false;
+}
+
 bool X86TargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
   assert(SrcAS != DestAS && "Expected different address spaces!");
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index 4a5f20fcc01726..916445c74bb903 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -58,6 +58,14 @@ class X86TargetMachine final : public LLVMTargetMachine {
   createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F,
                             const TargetSubtargetInfo *STI) const override;
 
+  yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+  yaml::MachineFunctionInfo *
+  convertFuncInfoToYAML(const MachineFunction &MF) const override;
+  bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+                                PerFunctionMIParsingState &PFS,
+                                SMDiagnostic &Error,
+                                SMRange &SourceRange) const override;
+
   void registerPassBuilderCallbacks(PassBuilder &PB,
                                     bool PopulateClassToPassNames) override;
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 0a23bf2516763a..74948778ccf853 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -4061,7 +4061,7 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::CTPOP,      MVT::i8,      {  1, 1, 2, 2 } }, // popcnt(zext())
   };
   static const CostKindTblEntry X64CostTbl[] = { // 64-bit targets
-    { ISD::ABS,        MVT::i64,     {  1,  2,  3,  4 } }, // SUB+CMOV
+    { ISD::ABS,        MVT::i64,     {  1,  2,  3,  3 } }, // SUB+CMOV
     { ISD::BITREVERSE, MVT::i64,     { 10, 12, 20, 22 } },
     { ISD::BSWAP,      MVT::i64,     {  1,  2,  1,  2 } },
     { ISD::CTLZ,       MVT::i64,     {  4 } }, // BSR+XOR or BSR+XOR+CMOV
@@ -4082,9 +4082,9 @@ X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
     { ISD::UMULO,      MVT::i64,     {  2 } }, // mulq + seto
   };
   static const CostKindTblEntry X86CostTbl[] = { // 32 or 64-bit targets
-    { ISD::ABS,        MVT::i32,     {  1,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
-    { ISD::ABS,        MVT::i16,     {  2,  2,  3,  4 } }, // SUB+XOR+SRA or SUB+CMOV
-    { ISD::ABS,        MVT::i8,      {  2,  4,  4,  4 } }, // SUB+XOR+SRA
+    { ISD::ABS,        MVT::i32,     {  1,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
+    { ISD::ABS,        MVT::i16,     {  2,  2,  3,  3 } }, // SUB+XOR+SRA or SUB+CMOV
+    { ISD::ABS,        MVT::i8,      {  2,  4,  4,  3 } }, // SUB+XOR+SRA
     { ISD::BITREVERSE, MVT::i32,     {  9, 12, 17, 19 } },
     { ISD::BITREVERSE, MVT::i16,     {  9, 12, 17, 19 } },
     { ISD::BITREVERSE, MVT::i8,      {  7,  9, 13, 14 } },
diff --git a/llvm/lib/TargetParser/AArch64TargetParser.cpp b/llvm/lib/TargetParser/AArch64TargetParser.cpp
index d1cc306790522d..ca356ec82bf1f9 100644
--- a/llvm/lib/TargetParser/AArch64TargetParser.cpp
+++ b/llvm/lib/TargetParser/AArch64TargetParser.cpp
@@ -122,14 +122,6 @@ AArch64::parseArchExtension(StringRef ArchExt) {
   return {};
 }
 
-std::optional<AArch64::ExtensionInfo>
-AArch64::targetFeatureToExtension(StringRef TargetFeature) {
-  for (const auto &E : Extensions)
-    if (TargetFeature == E.Feature)
-      return E;
-  return {};
-}
-
 std::optional<AArch64::CpuInfo> AArch64::parseCpu(StringRef Name) {
   // Resolve aliases first.
   Name = resolveCPUAlias(Name);
@@ -221,6 +213,21 @@ void AArch64::ExtensionSet::disable(ArchExtKind E) {
       disable(Dep.Later);
 }
 
+void AArch64::ExtensionSet::toLLVMFeatureList(
+    std::vector<StringRef> &Features) const {
+  if (BaseArch && !BaseArch->ArchFeature.empty())
+    Features.push_back(BaseArch->ArchFeature);
+
+  for (const auto &E : Extensions) {
+    if (E.Feature.empty() || !Touched.test(E.ID))
+      continue;
+    if (Enabled.test(E.ID))
+      Features.push_back(E.Feature);
+    else
+      Features.push_back(E.NegFeature);
+  }
+}
+
 void AArch64::ExtensionSet::addCPUDefaults(const CpuInfo &CPU) {
   LLVM_DEBUG(llvm::dbgs() << "addCPUDefaults(" << CPU.Name << ")\n");
   BaseArch = &CPU.Arch;
@@ -240,18 +247,11 @@ void AArch64::ExtensionSet::addArchDefaults(const ArchInfo &Arch) {
       enable(E.ID);
 }
 
-bool AArch64::ExtensionSet::parseModifier(StringRef Modifier,
-                                          const bool AllowNoDashForm) {
+bool AArch64::ExtensionSet::parseModifier(StringRef Modifier) {
   LLVM_DEBUG(llvm::dbgs() << "parseModifier(" << Modifier << ")\n");
 
-  size_t NChars = 0;
-  // The "no-feat" form is allowed in the target attribute but nowhere else.
-  if (AllowNoDashForm && Modifier.starts_with("no-"))
-    NChars = 3;
-  else if (Modifier.starts_with("no"))
-    NChars = 2;
-  bool IsNegated = NChars != 0;
-  StringRef ArchExt = Modifier.drop_front(NChars);
+  bool IsNegated = Modifier.starts_with("no");
+  StringRef ArchExt = IsNegated ? Modifier.drop_front(2) : Modifier;
 
   if (auto AE = parseArchExtension(ArchExt)) {
     if (AE->Feature.empty() || AE->NegFeature.empty())
@@ -265,21 +265,6 @@ bool AArch64::ExtensionSet::parseModifier(StringRef Modifier,
   return false;
 }
 
-void AArch64::ExtensionSet::reconstructFromParsedFeatures(
-    const std::vector<std::string> &Features) {
-  assert(Touched.none() && "Bitset already initialized");
-  for (auto &F : Features) {
-    bool IsNegated = F[0] == '-';
-    if (auto AE = targetFeatureToExtension(F)) {
-      Touched.set(AE->ID);
-      if (IsNegated)
-        Enabled.reset(AE->ID);
-      else
-        Enabled.set(AE->ID);
-    }
-  }
-}
-
 const AArch64::ExtensionInfo &
 AArch64::getExtensionByID(AArch64::ArchExtKind ExtID) {
   return lookupExtensionByID(ExtID);
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index e3802380d2beea..eda0c7f5da15ab 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -748,13 +748,13 @@ unsigned llvm::X86::getFeaturePriority(ProcessorFeatures Feat) {
 #ifndef NDEBUG
   // Check that priorities are set properly in the .def file. We expect that
   // "compat" features are assigned non-duplicate consecutive priorities
-  // starting from one (1, ..., 35) and multiple zeros.
+  // starting from one (1, ..., 37) and multiple zeros.
 #define X86_FEATURE_COMPAT(ENUM, STR, PRIORITY) PRIORITY,
   unsigned Priorities[] = {
 #include "llvm/TargetParser/X86TargetParser.def"
   };
   std::array<unsigned, std::size(Priorities)> HelperList;
-  const size_t MaxPriority = 35;
+  const size_t MaxPriority = 37;
   std::iota(HelperList.begin(), HelperList.begin() + MaxPriority + 1, 0);
   for (size_t i = MaxPriority + 1; i != std::size(Priorities); ++i)
     HelperList[i] = 0;
diff --git a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
index b58b906465e569..1cbf9c957f5c2b 100644
--- a/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
+++ b/llvm/lib/Transforms/IPO/MemProfContextDisambiguation.cpp
@@ -568,13 +568,13 @@ class CallsiteContextGraph {
   /// unioning their recorded alloc types.
   uint8_t computeAllocType(DenseSet<uint32_t> &ContextIds);
 
-  /// Returns the alloction type of the intersection of the contexts of two
+  /// Returns the allocation type of the intersection of the contexts of two
   /// nodes (based on their provided context id sets), optimized for the case
   /// when Node1Ids is smaller than Node2Ids.
   uint8_t intersectAllocTypesImpl(const DenseSet<uint32_t> &Node1Ids,
                                   const DenseSet<uint32_t> &Node2Ids);
 
-  /// Returns the alloction type of the intersection of the contexts of two
+  /// Returns the allocation type of the intersection of the contexts of two
   /// nodes (based on their provided context id sets).
   uint8_t intersectAllocTypes(const DenseSet<uint32_t> &Node1Ids,
                               const DenseSet<uint32_t> &Node2Ids);
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index 7e6a8817b7a67a..92ad4c34da6e7e 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -1662,8 +1662,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
           else if (OverwriteExistingWeights)
             I.setMetadata(LLVMContext::MD_prof, nullptr);
         } else if (!isa<IntrinsicInst>(&I)) {
-          setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])},
-                           /*IsExpected=*/false);
+          setBranchWeights(I, {static_cast<uint32_t>(BlockWeights[BB])});
         }
       }
     } else if (OverwriteExistingWeights || ProfileSampleBlockAccurate) {
@@ -1674,7 +1673,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
           if (cast<CallBase>(I).isIndirectCall()) {
             I.setMetadata(LLVMContext::MD_prof, nullptr);
           } else {
-            setBranchWeights(I, {uint32_t(0)}, /*IsExpected=*/false);
+            setBranchWeights(I, {uint32_t(0)});
           }
         }
       }
@@ -1757,7 +1756,7 @@ void SampleProfileLoader::generateMDProfMetadata(Function &F) {
     if (MaxWeight > 0 &&
         (!TI->extractProfTotalWeight(TempWeight) || OverwriteExistingWeights)) {
       LLVM_DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
-      setBranchWeights(*TI, Weights, /*IsExpected=*/false);
+      setBranchWeights(*TI, Weights);
       ORE->emit([&]() {
         return OptimizationRemark(DEBUG_TYPE, "PopularDest", MaxDestInst)
                << "most popular destination for conditional branches at "
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 4346a07e3a2cb2..436cdbff756699 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -2619,6 +2619,7 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
 
     // ldexp(x, zext(i1 y)) -> fmul x, (select y, 2.0, 1.0)
+    // ldexp(x, sext(i1 y)) -> fmul x, (select y, 0.5, 1.0)
     Value *ExtSrc;
     if (match(Exp, m_ZExt(m_Value(ExtSrc))) &&
         ExtSrc->getType()->getScalarSizeInBits() == 1) {
@@ -2627,6 +2628,13 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
                                ConstantFP::get(II->getType(), 1.0));
       return BinaryOperator::CreateFMulFMF(Src, Select, II);
     }
+    if (match(Exp, m_SExt(m_Value(ExtSrc))) &&
+        ExtSrc->getType()->getScalarSizeInBits() == 1) {
+      Value *Select =
+          Builder.CreateSelect(ExtSrc, ConstantFP::get(II->getType(), 0.5),
+                               ConstantFP::get(II->getType(), 1.0));
+      return BinaryOperator::CreateFMulFMF(Src, Select, II);
+    }
 
     break;
   }
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index 731104d4fcef02..0a3d8d6000cf47 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -1878,7 +1878,7 @@ void CHR::fixupBranchesAndSelects(CHRScope *Scope,
       static_cast<uint32_t>(CHRBranchBias.scale(1000)),
       static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
   };
-  setBranchWeights(*MergedBR, Weights, /*IsExpected=*/false);
+  setBranchWeights(*MergedBR, Weights);
   CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
             << "\n");
 }
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 6db76ca78b218a..23a7c6a20aecbc 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -259,8 +259,7 @@ CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
       promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
 
   if (AttachProfToDirectCall) {
-    setBranchWeights(NewInst, {static_cast<uint32_t>(Count)},
-                     /*IsExpected=*/false);
+    setBranchWeights(NewInst, {static_cast<uint32_t>(Count)});
   }
 
   using namespace ore;
diff --git a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
index aac57231ba2ed6..8a12fa19a3dedf 100644
--- a/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemProfiler.cpp
@@ -839,7 +839,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
   };
 
   // Now walk the instructions, looking up the associated profile data using
-  // dbug locations.
+  // debug locations.
   for (auto &BB : F) {
     for (auto &I : BB) {
       if (I.isDebugOrPseudoInst())
@@ -937,7 +937,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
             // Add callsite metadata for the instruction's location list so that
             // it simpler later on to identify which part of the MIB contexts
             // are from this particular instruction (including during inlining,
-            // when the callsite metdata will be updated appropriately).
+            // when the callsite metadata will be updated appropriately).
             // FIXME: can this be changed to strip out the matching stack
             // context ids from the MIB contexts and not add any callsite
             // metadata here to save space?
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index ac6d3348b3db9c..2269c2e0fffae9 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -1474,8 +1474,7 @@ void PGOUseFunc::populateCoverage(IndexedInstrProfReader *PGOReader) {
     for (auto *Succ : successors(&BB))
       Weights.push_back((Coverage[Succ] || !Coverage[&BB]) ? 1 : 0);
     if (Weights.size() >= 2)
-      llvm::setBranchWeights(*BB.getTerminator(), Weights,
-                             /*IsExpected=*/false);
+      llvm::setBranchWeights(*BB.getTerminator(), Weights);
   }
 
   unsigned NumCorruptCoverage = 0;
@@ -2261,7 +2260,7 @@ void llvm::setProfMetadata(Module *M, Instruction *TI,
 
   misexpect::checkExpectAnnotations(*TI, Weights, /*IsFrontend=*/false);
 
-  setBranchWeights(*TI, Weights, /*IsExpected=*/false);
+  setBranchWeights(*TI, Weights);
   if (EmitBranchProbability) {
     std::string BrCondStr = getBranchCondString(TI);
     if (BrCondStr.empty())
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index b9583836aea065..74a8f1958dfe93 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -231,7 +231,7 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
       Weights[0] = BP.getCompl().getNumerator();
       Weights[1] = BP.getNumerator();
     }
-    setBranchWeights(*PredBr, Weights, hasBranchWeightOrigin(*PredBr));
+    setBranchWeights(*PredBr, Weights);
   }
 }
 
@@ -2618,7 +2618,7 @@ void JumpThreadingPass::updateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
       Weights.push_back(Prob.getNumerator());
 
     auto TI = BB->getTerminator();
-    setBranchWeights(*TI, Weights, hasBranchWeightOrigin(*TI));
+    setBranchWeights(*TI, Weights);
   }
 }
 
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 17c5a4ee1fd0be..6f87e4d91d2c79 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -102,7 +102,7 @@ static bool handleSwitchExpect(SwitchInst &SI) {
   misexpect::checkExpectAnnotations(SI, Weights, /*IsFrontend=*/true);
 
   SI.setCondition(ArgValue);
-  setBranchWeights(SI, Weights, /*IsExpected=*/true);
+  setBranchWeights(SI, Weights);
   return true;
 }
 
@@ -262,13 +262,11 @@ static void handlePhiDef(CallInst *Expect) {
     if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
       BI->setMetadata(LLVMContext::MD_prof,
                       MDB.createBranchWeights(LikelyBranchWeightVal,
-                                              UnlikelyBranchWeightVal,
-                                              /*IsExpected=*/true));
+                                              UnlikelyBranchWeightVal));
     else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
       BI->setMetadata(LLVMContext::MD_prof,
                       MDB.createBranchWeights(UnlikelyBranchWeightVal,
-                                              LikelyBranchWeightVal,
-                                              /*IsExpected=*/true));
+                                              LikelyBranchWeightVal));
   }
 }
 
@@ -333,12 +331,12 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
   SmallVector<uint32_t, 4> ExpectedWeights;
   if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
       (Predicate == CmpInst::ICMP_EQ)) {
-    Node = MDB.createBranchWeights(
-        LikelyBranchWeightVal, UnlikelyBranchWeightVal, /*IsExpected=*/true);
+    Node =
+        MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
     ExpectedWeights = {LikelyBranchWeightVal, UnlikelyBranchWeightVal};
   } else {
-    Node = MDB.createBranchWeights(UnlikelyBranchWeightVal,
-                                   LikelyBranchWeightVal, /*IsExpected=*/true);
+    Node =
+        MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
     ExpectedWeights = {UnlikelyBranchWeightVal, LikelyBranchWeightVal};
   }
 
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index d380b8a02d90db..d3f69d09e70dcb 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -954,6 +954,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
       case Attribute::ShadowCallStack:
       case Attribute::SanitizeAddress:
       case Attribute::SanitizeMemory:
+      case Attribute::SanitizeNumericalStability:
       case Attribute::SanitizeThread:
       case Attribute::SanitizeHWAddress:
       case Attribute::SanitizeMemTag:
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index 12229123675e79..ce0f4c7668a40e 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -231,7 +231,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
           // Remove weight for this case.
           std::swap(Weights[Idx + 1], Weights.back());
           Weights.pop_back();
-          setBranchWeights(*SI, Weights, hasBranchWeightOrigin(MD));
+          setBranchWeights(*SI, Weights);
         }
         // Remove this entry.
         BasicBlock *ParentBB = SI->getParent();
diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index d517ec3d6e9f21..e2516930d251b9 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -680,7 +680,7 @@ struct WeightInfo {
 /// To avoid dealing with division rounding we can just multiple both part
 /// of weights to E and use weight as (F - I * E, E).
 static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
-  setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
+  setBranchWeights(*Term, Info.Weights);
   for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
     if (SubWeight != 0)
       // Don't set the probability of taking the edge from latch to loop header
@@ -1073,7 +1073,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   }
 
   for (const auto &[Term, Info] : Weights) {
-    setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
+    setBranchWeights(*Term, Info.Weights);
   }
 
   // Update Metadata for count of peeled off iterations.
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index 04042e71a2b82e..3d950b151cd32f 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -390,13 +390,13 @@ static void updateBranchWeights(BranchInst &PreHeaderBI, BranchInst &LoopBI,
       SuccsSwapped ? LoopBackWeight : ExitWeight1,
       SuccsSwapped ? ExitWeight1 : LoopBackWeight,
   };
-  setBranchWeights(LoopBI, LoopBIWeights, /*IsExpected=*/false);
+  setBranchWeights(LoopBI, LoopBIWeights);
   if (HasConditionalPreHeader) {
     const uint32_t PreHeaderBIWeights[] = {
         SuccsSwapped ? EnterWeight : ExitWeight0,
         SuccsSwapped ? ExitWeight0 : EnterWeight,
     };
-    setBranchWeights(PreHeaderBI, PreHeaderBIWeights, /*IsExpected=*/false);
+    setBranchWeights(PreHeaderBI, PreHeaderBIWeights);
   }
 }
 
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 107c8bb6c027fa..292739b6c5fdab 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -861,28 +861,26 @@ static bool ValuesOverlap(std::vector<ValueEqualityComparisonCase> &C1,
 
 // Set branch weights on SwitchInst. This sets the metadata if there is at
 // least one non-zero weight.
-static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights,
-                             bool IsExpected) {
+static void setBranchWeights(SwitchInst *SI, ArrayRef<uint32_t> Weights) {
   // Check that there is at least one non-zero weight. Otherwise, pass
   // nullptr to setMetadata which will erase the existing metadata.
   MDNode *N = nullptr;
   if (llvm::any_of(Weights, [](uint32_t W) { return W != 0; }))
-    N = MDBuilder(SI->getParent()->getContext())
-            .createBranchWeights(Weights, IsExpected);
+    N = MDBuilder(SI->getParent()->getContext()).createBranchWeights(Weights);
   SI->setMetadata(LLVMContext::MD_prof, N);
 }
 
 // Similar to the above, but for branch and select instructions that take
 // exactly 2 weights.
 static void setBranchWeights(Instruction *I, uint32_t TrueWeight,
-                             uint32_t FalseWeight, bool IsExpected) {
+                             uint32_t FalseWeight) {
   assert(isa<BranchInst>(I) || isa<SelectInst>(I));
   // Check that there is at least one non-zero weight. Otherwise, pass
   // nullptr to setMetadata which will erase the existing metadata.
   MDNode *N = nullptr;
   if (TrueWeight || FalseWeight)
     N = MDBuilder(I->getParent()->getContext())
-            .createBranchWeights(TrueWeight, FalseWeight, IsExpected);
+            .createBranchWeights(TrueWeight, FalseWeight);
   I->setMetadata(LLVMContext::MD_prof, N);
 }
 
@@ -1340,7 +1338,7 @@ bool SimplifyCFGOpt::PerformValueComparisonIntoPredecessorFolding(
 
     SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
 
-    setBranchWeights(NewSI, MDWeights, /*IsExpected=*/false);
+    setBranchWeights(NewSI, MDWeights);
   }
 
   EraseTerminatorAndDCECond(PTI);
@@ -3833,7 +3831,7 @@ static bool performBranchToCommonDestFolding(BranchInst *BI, BranchInst *PBI,
     FitWeights(NewWeights);
 
     SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(), NewWeights.end());
-    setBranchWeights(PBI, MDWeights[0], MDWeights[1], /*IsExpected=*/false);
+    setBranchWeights(PBI, MDWeights[0], MDWeights[1]);
 
     // TODO: If BB is reachable from all paths through PredBlock, then we
     // could replace PBI's branch probabilities with BI's.
@@ -4570,7 +4568,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
     // Halve the weights if any of them cannot fit in an uint32_t
     FitWeights(NewWeights);
 
-    setBranchWeights(PBI, NewWeights[0], NewWeights[1], /*IsExpected=*/false);
+    setBranchWeights(PBI, NewWeights[0], NewWeights[1]);
   }
 
   // OtherDest may have phi nodes.  If so, add an entry from PBI's
@@ -4606,8 +4604,7 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
 
         FitWeights(NewWeights);
 
-        setBranchWeights(NV, NewWeights[0], NewWeights[1],
-                         /*IsExpected=*/false);
+        setBranchWeights(NV, NewWeights[0], NewWeights[1]);
       }
     }
   }
@@ -4670,7 +4667,7 @@ bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
       // Create a conditional branch sharing the condition of the select.
       BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
       if (TrueWeight != FalseWeight)
-        setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false);
+        setBranchWeights(NewBI, TrueWeight, FalseWeight);
     }
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
@@ -5620,7 +5617,7 @@ bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
         TrueWeight /= 2;
         FalseWeight /= 2;
       }
-      setBranchWeights(NewBI, TrueWeight, FalseWeight, /*IsExpected=*/false);
+      setBranchWeights(NewBI, TrueWeight, FalseWeight);
     }
   }
 
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1acecf2738b5f2..c7c19ef456c7cb 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2145,7 +2145,7 @@ class GeneratedRTChecks {
 
     BranchInst &BI = *BranchInst::Create(Bypass, LoopVectorPreHeader, Cond);
     if (AddBranchWeights)
-      setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false);
+      setBranchWeights(BI, SCEVCheckBypassWeights);
     ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI);
     return SCEVCheckBlock;
   }
@@ -2173,7 +2173,7 @@ class GeneratedRTChecks {
     BranchInst &BI =
         *BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
     if (AddBranchWeights) {
-      setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
+      setBranchWeights(BI, MemCheckBypassWeights);
     }
     ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
     MemCheckBlock->getTerminator()->setDebugLoc(
@@ -2889,7 +2889,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
   BranchInst &BI =
       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
-    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
+    setBranchWeights(BI, MinItersBypassWeights);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
   LoopBypassBlocks.push_back(TCCheckBlock);
 }
@@ -3128,7 +3128,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
       unsigned TripCount = UF * VF.getKnownMinValue();
       assert(TripCount > 0 && "trip count should not be zero");
       const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(BI, Weights, /*IsExpected=*/false);
+      setBranchWeights(BI, Weights);
     }
   }
 
@@ -7669,7 +7669,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
   BranchInst &BI =
       *BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
   if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator()))
-    setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false);
+    setBranchWeights(BI, MinItersBypassWeights);
   ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI);
 
   return TCCheckBlock;
@@ -7826,7 +7826,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
     unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
     const uint32_t Weights[] = {EstimatedSkipCount,
                                 MainLoopStep - EstimatedSkipCount};
-    setBranchWeights(BI, Weights, /*IsExpected=*/false);
+    setBranchWeights(BI, Weights);
   }
   ReplaceInstWithInst(Insert->getTerminator(), &BI);
 
diff --git a/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll b/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll
index 4498d499745aa8..5d41228fdce186 100644
--- a/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll
+++ b/llvm/test/Analysis/CostModel/X86/abs-sizelatency.ll
@@ -14,42 +14,42 @@
 
 define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
 ; SSE-LABEL: 'cost_abs_i64'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i64'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i64'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i64'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i64'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i64'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 false)
@@ -64,56 +64,56 @@ define void @cost_abs_i64(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64>
 
 define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
 ; SSE2-LABEL: 'cost_abs_i32'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i32'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i32'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i32'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i32'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i32'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i32'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i32'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 false)
@@ -128,56 +128,56 @@ define void @cost_abs_i32(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32>
 
 define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
 ; SSE2-LABEL: 'cost_abs_i16'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i16'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i16'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i16'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i16'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i16'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i16'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i16'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 false)
@@ -192,56 +192,56 @@ define void @cost_abs_i16(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16
 
 define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
 ; SSE2-LABEL: 'cost_abs_i8'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i8'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i8'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i8'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i8'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i8'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i8'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i8'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 false)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 false)
@@ -260,42 +260,42 @@ define void @cost_abs_i8(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a5
 
 define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8 x i64> %a512) {
 ; SSE-LABEL: 'cost_abs_i64_poison'
-; SSE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; SSE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 20 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
 ; SSE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i64_poison'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 24 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i64_poison'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i64_poison'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i64_poison'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i64_poison'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I64 = call i64 @llvm.abs.i64(i64 %a64, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I64 = call <2 x i64> @llvm.abs.v2i64(<2 x i64> %a128, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I64 = call <4 x i64> @llvm.abs.v4i64(<4 x i64> %a256, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I64 = call <8 x i64> @llvm.abs.v8i64(<8 x i64> %a512, i1 true)
@@ -310,56 +310,56 @@ define void @cost_abs_i64_poison(i64 %a64, <2 x i64> %a128, <4 x i64> %a256, <8
 
 define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16 x i32> %a512) {
 ; SSE2-LABEL: 'cost_abs_i32_poison'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i32_poison'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i32_poison'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i32_poison'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i32_poison'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i32_poison'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i32_poison'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i32_poison'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I32 = call i32 @llvm.abs.i32(i32 %a32, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V2I32 = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %a128, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V4I32 = call <8 x i32> @llvm.abs.v8i32(<8 x i32> %a256, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I32 = call <16 x i32> @llvm.abs.v16i32(<16 x i32> %a512, i1 true)
@@ -374,56 +374,56 @@ define void @cost_abs_i32_poison(i32 %a32, <4 x i32> %a128, <8 x i32> %a256, <16
 
 define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <32 x i16> %a512) {
 ; SSE2-LABEL: 'cost_abs_i16_poison'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i16_poison'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i16_poison'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i16_poison'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i16_poison'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i16_poison'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i16_poison'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i16_poison'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I16 = call i16 @llvm.abs.i16(i16 %a16, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V8I16 = call <8 x i16> @llvm.abs.v8i16(<8 x i16> %a128, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I16 = call <16 x i16> @llvm.abs.v16i16(<16 x i16> %a256, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I16 = call <32 x i16> @llvm.abs.v32i16(<32 x i16> %a512, i1 true)
@@ -438,56 +438,56 @@ define void @cost_abs_i16_poison(i16 %a16, <8 x i16> %a128, <16 x i16> %a256, <3
 
 define void @cost_abs_i8_poison(i8 %a8, <16 x i8> %a128, <32 x i8> %a256, <64 x i8> %a512) {
 ; SSE2-LABEL: 'cost_abs_i8_poison'
-; SSE2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 12 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; SSE2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSSE3-LABEL: 'cost_abs_i8_poison'
-; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; SSSE3-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; SSSE3-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; SSE42-LABEL: 'cost_abs_i8_poison'
-; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; SSE42-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; SSE42-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX1-LABEL: 'cost_abs_i8_poison'
-; AVX1-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; AVX1-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 5 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; AVX1-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX2-LABEL: 'cost_abs_i8_poison'
-; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; AVX2-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; AVX2-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512F-LABEL: 'cost_abs_i8_poison'
-; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; AVX512F-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; AVX512F-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512DQ-LABEL: 'cost_abs_i8_poison'
-; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
 ; AVX512DQ-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 ; AVX512BW-LABEL: 'cost_abs_i8_poison'
-; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
+; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %I8 = call i8 @llvm.abs.i8(i8 %a8, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V16I8 = call <16 x i8> @llvm.abs.v16i8(<16 x i8> %a128, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V32I8 = call <32 x i8> @llvm.abs.v32i8(<32 x i8> %a256, i1 true)
 ; AVX512BW-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %V64I8 = call <64 x i8> @llvm.abs.v64i8(<64 x i8> %a512, i1 true)
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
index a4846dade4dbd5..7d2b2a3b25d573 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa-alias.ll
@@ -1,5 +1,4 @@
 ; REQUIRES: aarch64-registered-target
-; REQUIRES: shell
 
 ; Test IPA over a single combined file
 ; RUN: llvm-as %s -o %t0.bc
@@ -14,27 +13,27 @@
 ; RUN: opt -module-summary %s -o %t.summ0.bc
 ; RUN: opt -module-summary %S/Inputs/ipa-alias.ll -o %t.summ1.bc
 
-; RUN: echo > %t.res.txt \
-; RUN:  -r %t.summ0.bc,AliasCall,px \
-; RUN:  -r %t.summ0.bc,AliasToBitcastAliasWrite1, \
-; RUN:  -r %t.summ0.bc,AliasToPreemptableAliasWrite1, \
-; RUN:  -r %t.summ0.bc,AliasWrite1, \
-; RUN:  -r %t.summ0.bc,BitcastAliasCall,px \
-; RUN:  -r %t.summ0.bc,BitcastAliasWrite1, \
-; RUN:  -r %t.summ0.bc,InterposableAliasCall,px \
-; RUN:  -r %t.summ0.bc,InterposableAliasWrite1, \
-; RUN:  -r %t.summ0.bc,PreemptableAliasCall,px \
-; RUN:  -r %t.summ0.bc,PreemptableAliasWrite1, \
-; RUN:  -r %t.summ1.bc,AliasToBitcastAliasWrite1,px \
-; RUN:  -r %t.summ1.bc,AliasToPreemptableAliasWrite1,px \
-; RUN:  -r %t.summ1.bc,AliasWrite1,px \
-; RUN:  -r %t.summ1.bc,BitcastAliasWrite1,px \
-; RUN:  -r %t.summ1.bc,InterposableAliasWrite1,px \
-; RUN:  -r %t.summ1.bc,PreemptableAliasWrite1,px \
-; RUN:  -r %t.summ1.bc,Write1,px
+; DEFINE: %{res} = \
+; DEFINE:  -r %t.summ0.bc,AliasCall,px \
+; DEFINE:  -r %t.summ0.bc,AliasToBitcastAliasWrite1, \
+; DEFINE:  -r %t.summ0.bc,AliasToPreemptableAliasWrite1, \
+; DEFINE:  -r %t.summ0.bc,AliasWrite1, \
+; DEFINE:  -r %t.summ0.bc,BitcastAliasCall,px \
+; DEFINE:  -r %t.summ0.bc,BitcastAliasWrite1, \
+; DEFINE:  -r %t.summ0.bc,InterposableAliasCall,px \
+; DEFINE:  -r %t.summ0.bc,InterposableAliasWrite1, \
+; DEFINE:  -r %t.summ0.bc,PreemptableAliasCall,px \
+; DEFINE:  -r %t.summ0.bc,PreemptableAliasWrite1, \
+; DEFINE:  -r %t.summ1.bc,AliasToBitcastAliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,AliasToPreemptableAliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,AliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,BitcastAliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,InterposableAliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,PreemptableAliasWrite1,px \
+; DEFINE:  -r %t.summ1.bc,Write1,px
 
 ; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-print -stack-safety-run -save-temps -thinlto-threads 1 -O0 \
-; RUN:  $(cat %t.res.txt) \
+; RUN:  %{res} \
 ; RUN:    2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL,LTO
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
diff --git a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
index 8de56adc2bbd1c..0e2a08f3ae8df7 100644
--- a/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
+++ b/llvm/test/Analysis/StackSafetyAnalysis/ipa.ll
@@ -1,5 +1,4 @@
 ; REQUIRES: aarch64-registered-target
-; REQUIRES: shell
 
 ; RUN: llvm-as %s -o %t0.bc
 ; RUN: llvm-as %S/Inputs/ipa.ll -o %t1.bc
@@ -17,78 +16,79 @@
 ; RUN: llvm-dis %t.summ0.bc -o - > %t.ids.txt
 ; RUN: llvm-dis %t.summ1.bc -o - >> %t.ids.txt
 
-; RUN: echo > %t.res.txt \
-; RUN:  -r %t.summ0.bc,ExternalCall, \
-; RUN:  -r %t.summ0.bc,f1,px \
-; RUN:  -r %t.summ0.bc,f2,px \
-; RUN:  -r %t.summ0.bc,f3,px \
-; RUN:  -r %t.summ0.bc,f4,px \
-; RUN:  -r %t.summ0.bc,f5,px \
-; RUN:  -r %t.summ0.bc,f6,px \
-; RUN:  -r %t.summ0.bc,f7,px \
-; RUN:  -r %t.summ0.bc,f8left,px \
-; RUN:  -r %t.summ0.bc,f8oobleft,px \
-; RUN:  -r %t.summ0.bc,f8oobright,px \
-; RUN:  -r %t.summ0.bc,f8right,px \
-; RUN:  -r %t.summ0.bc,InterposableCall,px \
-; RUN:  -r %t.summ0.bc,InterposableWrite1, \
-; RUN:  -r %t.summ0.bc,PreemptableCall,px \
-; RUN:  -r %t.summ0.bc,PreemptableWrite1, \
-; RUN:  -r %t.summ0.bc,PrivateCall,px \
-; RUN:  -r %t.summ0.bc,Rec2, \
-; RUN:  -r %t.summ0.bc,RecursiveNoOffset, \
-; RUN:  -r %t.summ0.bc,RecursiveWithOffset, \
-; RUN:  -r %t.summ0.bc,ReturnDependent, \
-; RUN:  -r %t.summ0.bc,TestCrossModuleConflict,px \
-; RUN:  -r %t.summ0.bc,TestCrossModuleOnce,px \
-; RUN:  -r %t.summ0.bc,TestCrossModuleTwice,px \
-; RUN:  -r %t.summ0.bc,TestCrossModuleWeak,px \
-; RUN:  -r %t.summ0.bc,TestRecursiveNoOffset,px \
-; RUN:  -r %t.summ0.bc,TestRecursiveWithOffset,px \
-; RUN:  -r %t.summ0.bc,TestUpdateArg,px \
-; RUN:  -r %t.summ0.bc,TwoArguments,px \
-; RUN:  -r %t.summ0.bc,TwoArgumentsOOBBoth,px \
-; RUN:  -r %t.summ0.bc,TwoArgumentsOOBOne,px \
-; RUN:  -r %t.summ0.bc,TwoArgumentsOOBOther,px \
-; RUN:  -r %t.summ0.bc,Weak,x \
-; RUN:  -r %t.summ0.bc,Write1, \
-; RUN:  -r %t.summ0.bc,Write1DiffModule,x \
-; RUN:  -r %t.summ0.bc,Write1Module0,px \
-; RUN:  -r %t.summ0.bc,Write1Private,x \
-; RUN:  -r %t.summ0.bc,Write1SameModule,x \
-; RUN:  -r %t.summ0.bc,Write1Weak,x \
-; RUN:  -r %t.summ0.bc,Write4_2, \
-; RUN:  -r %t.summ0.bc,Write4, \
-; RUN:  -r %t.summ0.bc,Write8, \
-; RUN:  -r %t.summ0.bc,WriteAndReturn8, \
-; RUN:  -r %t.summ1.bc,ExternalCall,px \
-; RUN:  -r %t.summ1.bc,InterposableWrite1,px \
-; RUN:  -r %t.summ1.bc,PreemptableWrite1,px \
-; RUN:  -r %t.summ1.bc,Rec0,px \
-; RUN:  -r %t.summ1.bc,Rec1,px \
-; RUN:  -r %t.summ1.bc,Rec2,px \
-; RUN:  -r %t.summ1.bc,RecursiveNoOffset,px \
-; RUN:  -r %t.summ1.bc,RecursiveWithOffset,px \
-; RUN:  -r %t.summ1.bc,ReturnAlloca,px \
-; RUN:  -r %t.summ1.bc,ReturnDependent,px \
-; RUN:  -r %t.summ1.bc,Weak,x \
-; RUN:  -r %t.summ1.bc,Write1,px \
-; RUN:  -r %t.summ1.bc,Write1DiffModule,px \
-; RUN:  -r %t.summ1.bc,Write1Module0,x \
-; RUN:  -r %t.summ1.bc,Write1Private,px \
-; RUN:  -r %t.summ1.bc,Write1SameModule,px \
-; RUN:  -r %t.summ1.bc,Write1Weak,px \
-; RUN:  -r %t.summ1.bc,Write4_2,px \
-; RUN:  -r %t.summ1.bc,Write4,px \
-; RUN:  -r %t.summ1.bc,Write8,px \
-; RUN:  -r %t.summ1.bc,WriteAndReturn8,px
+; DEFINE: %{res} = \
+; DEFINE:  -r %t.summ0.bc,ExternalCall, \
+; DEFINE:  -r %t.summ0.bc,f1,px \
+; DEFINE:  -r %t.summ0.bc,f2,px \
+; DEFINE:  -r %t.summ0.bc,f3,px \
+; DEFINE:  -r %t.summ0.bc,f4,px \
+; DEFINE:  -r %t.summ0.bc,f5,px \
+; DEFINE:  -r %t.summ0.bc,f6,px \
+; DEFINE:  -r %t.summ0.bc,f7,px \
+; DEFINE:  -r %t.summ0.bc,f8left,px \
+; DEFINE:  -r %t.summ0.bc,f8oobleft,px \
+; DEFINE:  -r %t.summ0.bc,f8oobright,px \
+; DEFINE:  -r %t.summ0.bc,f8right,px \
+; DEFINE:  -r %t.summ0.bc,InterposableCall,px \
+; DEFINE:  -r %t.summ0.bc,InterposableWrite1, \
+; DEFINE:  -r %t.summ0.bc,PreemptableCall,px \
+; DEFINE:  -r %t.summ0.bc,PreemptableWrite1, \
+; DEFINE:  -r %t.summ0.bc,PrivateCall,px \
+; DEFINE:  -r %t.summ0.bc,Rec2, \
+; DEFINE:  -r %t.summ0.bc,RecursiveNoOffset, \
+; DEFINE:  -r %t.summ0.bc,RecursiveWithOffset, \
+; DEFINE:  -r %t.summ0.bc,ReturnDependent, \
+; DEFINE:  -r %t.summ0.bc,TestCrossModuleConflict,px \
+; DEFINE:  -r %t.summ0.bc,TestCrossModuleOnce,px \
+; DEFINE:  -r %t.summ0.bc,TestCrossModuleTwice,px \
+; DEFINE:  -r %t.summ0.bc,TestCrossModuleWeak,px \
+; DEFINE:  -r %t.summ0.bc,TestRecursiveNoOffset,px \
+; DEFINE:  -r %t.summ0.bc,TestRecursiveWithOffset,px \
+; DEFINE:  -r %t.summ0.bc,TestUpdateArg,px \
+; DEFINE:  -r %t.summ0.bc,TwoArguments,px \
+; DEFINE:  -r %t.summ0.bc,TwoArgumentsOOBBoth,px \
+; DEFINE:  -r %t.summ0.bc,TwoArgumentsOOBOne,px \
+; DEFINE:  -r %t.summ0.bc,TwoArgumentsOOBOther,px \
+; DEFINE:  -r %t.summ0.bc,Weak,x \
+; DEFINE:  -r %t.summ0.bc,Write1, \
+; DEFINE:  -r %t.summ0.bc,Write1DiffModule,x \
+; DEFINE:  -r %t.summ0.bc,Write1Module0,px \
+; DEFINE:  -r %t.summ0.bc,Write1Private,x \
+; DEFINE:  -r %t.summ0.bc,Write1SameModule,x \
+; DEFINE:  -r %t.summ0.bc,Write1Weak,x \
+; DEFINE:  -r %t.summ0.bc,Write4_2, \
+; DEFINE:  -r %t.summ0.bc,Write4, \
+; DEFINE:  -r %t.summ0.bc,Write8, \
+; DEFINE:  -r %t.summ0.bc,WriteAndReturn8, \
+; DEFINE:  -r %t.summ1.bc,ExternalCall,px \
+; DEFINE:  -r %t.summ1.bc,InterposableWrite1,px \
+; DEFINE:  -r %t.summ1.bc,PreemptableWrite1,px \
+; DEFINE:  -r %t.summ1.bc,Rec0,px \
+; DEFINE:  -r %t.summ1.bc,Rec1,px \
+; DEFINE:  -r %t.summ1.bc,Rec2,px \
+; DEFINE:  -r %t.summ1.bc,RecursiveNoOffset,px \
+; DEFINE:  -r %t.summ1.bc,RecursiveWithOffset,px \
+; DEFINE:  -r %t.summ1.bc,ReturnAlloca,px \
+; DEFINE:  -r %t.summ1.bc,ReturnDependent,px \
+; DEFINE:  -r %t.summ1.bc,Weak,x \
+; DEFINE:  -r %t.summ1.bc,Write1,px \
+; DEFINE:  -r %t.summ1.bc,Write1DiffModule,px \
+; DEFINE:  -r %t.summ1.bc,Write1Module0,x \
+; DEFINE:  -r %t.summ1.bc,Write1Private,px \
+; DEFINE:  -r %t.summ1.bc,Write1SameModule,px \
+; DEFINE:  -r %t.summ1.bc,Write1Weak,px \
+; DEFINE:  -r %t.summ1.bc,Write4_2,px \
+; DEFINE:  -r %t.summ1.bc,Write4,px \
+; DEFINE:  -r %t.summ1.bc,Write8,px \
+; DEFINE:  -r %t.summ1.bc,WriteAndReturn8,px
 
 ; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-print -stack-safety-run -save-temps -thinlto-threads 1 -O0 \
-; RUN:  $(cat %t.res.txt) \
+; RUN:  %{res} \
 ; RUN:    2>&1 | FileCheck %s --check-prefixes=CHECK,GLOBAL,LTO
 
-; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-run -thinlto-distributed-indexes -thinlto-threads 1 -O0 $(cat %t.res.txt)
-; RUN: (cat %t.ids.txt ; llvm-dis %t.summ1.bc.thinlto.bc -o -) | FileCheck --check-prefixes=INDEX %s
+; RUN: llvm-lto2 run %t.summ0.bc %t.summ1.bc -o %t.lto -stack-safety-run -thinlto-distributed-indexes -thinlto-threads 1 -O0 %{res}
+; RUN: llvm-dis %t.summ1.bc.thinlto.bc -o - >> %t.ids.txt
+; RUN: FileCheck --check-prefixes=INDEX %s < %t.ids.txt
 
 target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64-unknown-linux"
diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll
index 6546d1a275c99f..613630e1a2b4d2 100644
--- a/llvm/test/Assembler/fp-intrinsics-attr.ll
+++ b/llvm/test/Assembler/fp-intrinsics-attr.ll
@@ -85,6 +85,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp {
                                                metadata !"round.dynamic",
                                                metadata !"fpexcept.strict")
 
+  %tan = call double @llvm.experimental.constrained.tan.f64(
+                                               double %a,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict")
+
   %pow = call double @llvm.experimental.constrained.pow.f64(
                                                double %a, double %b,
                                                metadata !"round.dynamic",
@@ -244,6 +249,9 @@ declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.cos.f64({{.*}}) #[[ATTR1]]
 
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+; CHECK: @llvm.experimental.constrained.tan.f64({{.*}}) #[[ATTR1]]
+
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 ; CHECK: @llvm.experimental.constrained.pow.f64({{.*}}) #[[ATTR1]]
 
diff --git a/llvm/test/Bitcode/compatibility.ll b/llvm/test/Bitcode/compatibility.ll
index 2a846e036924c7..e437c37d8d1c87 100644
--- a/llvm/test/Bitcode/compatibility.ll
+++ b/llvm/test/Bitcode/compatibility.ll
@@ -1564,7 +1564,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #51
+  ; CHECK: call void @f.nobuiltin() #52
 
   call fastcc noalias ptr @f.noalias() noinline
   ; CHECK: call fastcc noalias ptr @f.noalias() #12
@@ -1988,6 +1988,8 @@ declare void @f.nosanitize_bounds() nosanitize_bounds
 declare void @f.allockind() allockind("alloc,uninitialized")
 ; CHECK: declare void @f.allockind() #50
 
+declare void @f.sanitize_numerical_stability() sanitize_numerical_stability
+; CHECK: declare void @f.sanitize_numerical_stability() #51
 
 ; CHECK: declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan))
 declare nofpclass(snan) float @nofpclass_snan(float nofpclass(snan))
@@ -2110,7 +2112,8 @@ define float @nofpclass_callsites(float %arg) {
 ; CHECK: attributes #48 = { allocsize(1,0) }
 ; CHECK: attributes #49 = { nosanitize_bounds }
 ; CHECK: attributes #50 = { allockind("alloc,uninitialized") }
-; CHECK: attributes #51 = { builtin }
+; CHECK: attributes #51 = { sanitize_numerical_stability }
+; CHECK: attributes #52 = { builtin }
 
 ;; Metadata
 
diff --git a/llvm/test/CodeGen/AArch64/arm64-redzone.ll b/llvm/test/CodeGen/AArch64/arm64-redzone.ll
index fe30a1a98521e1..d001bc2a8dbe4e 100644
--- a/llvm/test/CodeGen/AArch64/arm64-redzone.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-redzone.ll
@@ -16,3 +16,16 @@ define i32 @foo(i32 %a, i32 %b) nounwind ssp {
   %tmp2 = load i32, ptr %x, align 4
   ret i32 %tmp2
 }
+
+; We disable red-zone if NEON is available because copies of Q-regs
+; require a spill/fill and dynamic allocation. But we only need to do
+; this when FP registers are enabled.
+define void @bar(fp128 %f) "target-features"="-fp-armv8" {
+; CHECK-LABEL: bar:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x0, x1, [sp, #-16]
+; CHECK-NEXT: ret
+  %ptr = alloca fp128
+  store fp128 %f, ptr %ptr
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
index 48062c9a54b5d3..b09ed8d3eb764e 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-fp16.ll
@@ -338,6 +338,21 @@ define half @cos_f16(half %x) #0 {
   ret half %val
 }
 
+define half @tan_f16(half %x) #0 {
+; CHECK-LABEL: tan_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset w30, -16
+; CHECK-NEXT:    fcvt s0, h0
+; CHECK-NEXT:    bl tanf
+; CHECK-NEXT:    fcvt h0, s0
+; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %val = call half @llvm.experimental.constrained.tan.f16(half %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret half %val
+}
+
 define half @pow_f16(half %x, half %y) #0 {
 ; CHECK-LABEL: pow_f16:
 ; CHECK:       // %bb.0:
@@ -1147,6 +1162,7 @@ declare half @llvm.experimental.constrained.sqrt.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.powi.f16(half, i32, metadata, metadata)
 declare half @llvm.experimental.constrained.sin.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.cos.f16(half, metadata, metadata)
+declare half @llvm.experimental.constrained.tan.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.pow.f16(half, half, metadata, metadata)
 declare half @llvm.experimental.constrained.log.f16(half, metadata, metadata)
 declare half @llvm.experimental.constrained.log10.f16(half, metadata, metadata)
diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
index 685efbb7cad431..67d0b63f4076f6 100644
--- a/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/AArch64/fp-intrinsics.ll
@@ -146,6 +146,13 @@ define float @cos_f32(float %x) #0 {
   ret float %val
 }
 
+; CHECK-LABEL: tan_f32:
+; CHECK: bl tanf
+define float @tan_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
 ; CHECK-LABEL: pow_f32:
 ; CHECK: bl powf
 define float @pow_f32(float %x, float %y) #0 {
@@ -630,6 +637,13 @@ define double @cos_f64(double %x) #0 {
   ret double %val
 }
 
+; CHECK-LABEL: tan_f64:
+; CHECK: bl tan
+define double @tan_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.tan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
 ; CHECK-LABEL: pow_f64:
 ; CHECK: bl pow
 define double @pow_f64(double %x, double %y) #0 {
@@ -1114,6 +1128,13 @@ define fp128 @cos_f128(fp128 %x) #0 {
   ret fp128 %val
 }
 
+; CHECK-LABEL: tan_f128:
+; CHECK: bl tanl
+define fp128 @tan_f128(fp128 %x) #0 {
+  %val = call fp128 @llvm.experimental.constrained.tan.f128(fp128 %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret fp128 %val
+}
+
 ; CHECK-LABEL: pow_f128:
 ; CHECK: bl powl
 define fp128 @pow_f128(fp128 %x, fp128 %y) #0 {
@@ -1491,6 +1512,13 @@ define <1 x double> @cos_v1f64(<1 x double> %x, <1 x double> %y) #0 {
   ret <1 x double> %val
 }
 
+; CHECK-LABEL: tan_v1f64:
+; CHECK: bl tan
+define <1 x double> @tan_v1f64(<1 x double> %x, <1 x double> %y) #0 {
+  %val = call <1 x double> @llvm.experimental.constrained.tan.v1f64(<1 x double> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret <1 x double> %val
+}
+
 ; CHECK-LABEL: pow_v1f64:
 ; CHECK: bl pow
 define <1 x double> @pow_v1f64(<1 x double> %x, <1 x double> %y) #0 {
@@ -1555,6 +1583,7 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, metadata)
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata)
@@ -1599,6 +1628,7 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat
 declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
@@ -1643,6 +1673,7 @@ declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata
 declare fp128 @llvm.experimental.constrained.powi.f128(fp128, i32, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.cos.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.pow.f128(fp128, fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.log.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.log10.f128(fp128, metadata, metadata)
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll
new file mode 100644
index 00000000000000..b7304b957a0013
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-reloc.ll
@@ -0,0 +1,176 @@
+; RUN: rm -rf %t && split-file %s %t && cd %t
+
+;--- ok.ll
+
+; RUN: llc < ok.ll -mtriple arm64e-apple-darwin \
+; RUN:   | FileCheck %s --check-prefix=CHECK-MACHO
+; RUN: llc < ok.ll -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ELF
+
+; RUN: llc < ok.ll -mtriple arm64e-apple-darwin \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-MACHO
+; RUN: llc < ok.ll -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ELF
+
+@g = external global i32
+
+@g_weak = extern_weak global i32
+
+@g_strong_def = constant i32 42
+
+; CHECK-ELF-LABEL:     .globl g.ref.ia.0
+; CHECK-ELF-NEXT:      .p2align 4
+; CHECK-ELF-NEXT:    g.ref.ia.0:
+; CHECK-ELF-NEXT:      .xword 5
+; CHECK-ELF-NEXT:      .xword g@AUTH(ia,0)
+; CHECK-ELF-NEXT:      .xword 6
+
+; CHECK-MACHO-LABEL:   .section __DATA,__const
+; CHECK-MACHO-NEXT:    .globl _g.ref.ia.0
+; CHECK-MACHO-NEXT:    .p2align 4
+; CHECK-MACHO-NEXT:  _g.ref.ia.0:
+; CHECK-MACHO-NEXT:    .quad 5
+; CHECK-MACHO-NEXT:    .quad _g@AUTH(ia,0)
+; CHECK-MACHO-NEXT:    .quad 6
+
+@g.ref.ia.0 = constant { i64, ptr, i64 } { i64 5, ptr ptrauth (ptr @g, i32 0), i64 6 }
+
+; CHECK-ELF-LABEL:     .globl g.ref.ia.42
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g.ref.ia.42:
+; CHECK-ELF-NEXT:      .xword g@AUTH(ia,42)
+
+; CHECK-MACHO-LABEL:   .globl _g.ref.ia.42
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g.ref.ia.42:
+; CHECK-MACHO-NEXT:    .quad _g@AUTH(ia,42)
+
+@g.ref.ia.42 = constant ptr ptrauth (ptr @g, i32 0, i64 42)
+
+; CHECK-ELF-LABEL:     .globl g.ref.ib.0
+; CHECK-ELF-NEXT:      .p2align 4
+; CHECK-ELF-NEXT:    g.ref.ib.0:
+; CHECK-ELF-NEXT:      .xword 5
+; CHECK-ELF-NEXT:      .xword g@AUTH(ib,0)
+; CHECK-ELF-NEXT:      .xword 6
+
+; CHECK-MACHO-LABEL:   .globl _g.ref.ib.0
+; CHECK-MACHO-NEXT:    .p2align 4
+; CHECK-MACHO-NEXT:  _g.ref.ib.0:
+; CHECK-MACHO-NEXT:    .quad 5
+; CHECK-MACHO-NEXT:    .quad _g@AUTH(ib,0)
+; CHECK-MACHO-NEXT:    .quad 6
+
+@g.ref.ib.0 = constant { i64, ptr, i64 } { i64 5, ptr ptrauth (ptr @g, i32 1, i64 0), i64 6 }
+
+; CHECK-ELF-LABEL:     .globl g.ref.da.42.addr
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g.ref.da.42.addr:
+; CHECK-ELF-NEXT:      .xword g@AUTH(da,42,addr)
+
+; CHECK-MACHO-LABEL:   .globl _g.ref.da.42.addr
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g.ref.da.42.addr:
+; CHECK-MACHO-NEXT:    .quad _g@AUTH(da,42,addr)
+
+@g.ref.da.42.addr = constant ptr ptrauth (ptr @g, i32 2, i64 42, ptr @g.ref.da.42.addr)
+
+; CHECK-ELF-LABEL:     .globl g.offset.ref.da.0
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g.offset.ref.da.0:
+; CHECK-ELF-NEXT:      .xword (g+16)@AUTH(da,0)
+
+; CHECK-MACHO-LABEL:   .globl _g.offset.ref.da.0
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g.offset.ref.da.0:
+; CHECK-MACHO-NEXT:    .quad (_g+16)@AUTH(da,0)
+
+@g.offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2)
+
+; CHECK-ELF-LABEL:     .globl g.big_offset.ref.da.0
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g.big_offset.ref.da.0:
+; CHECK-ELF-NEXT:      .xword (g+2147549185)@AUTH(da,0)
+
+; CHECK-MACHO-LABEL:   .globl _g.big_offset.ref.da.0
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g.big_offset.ref.da.0:
+; CHECK-MACHO-NEXT:    .quad (_g+2147549185)@AUTH(da,0)
+
+@g.big_offset.ref.da.0 = constant ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 add (i64 2147483648, i64 65537)), i32 2)
+
+; CHECK-ELF-LABEL:     .globl g.weird_ref.da.0
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g.weird_ref.da.0:
+; CHECK-ELF-NEXT:      .xword (g+16)@AUTH(da,0)
+
+; CHECK-MACHO-LABEL:   .globl _g.weird_ref.da.0
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g.weird_ref.da.0:
+; CHECK-MACHO-NEXT:    .quad (_g+16)@AUTH(da,0)
+
+@g.weird_ref.da.0 = constant i64 ptrtoint (ptr inttoptr (i64 ptrtoint (ptr ptrauth (i8* getelementptr (i8, ptr @g, i64 16), i32 2) to i64) to ptr) to i64)
+
+; CHECK-ELF-LABEL:     .globl g_weak.ref.ia.42
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g_weak.ref.ia.42:
+; CHECK-ELF-NEXT:      .xword g_weak@AUTH(ia,42)
+
+; CHECK-MACHO-LABEL:   .globl _g_weak.ref.ia.42
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g_weak.ref.ia.42:
+; CHECK-MACHO-NEXT:    .quad _g_weak@AUTH(ia,42)
+
+@g_weak.ref.ia.42 = constant ptr ptrauth (ptr @g_weak, i32 0, i64 42)
+
+; CHECK-ELF-LABEL:     .globl g_strong_def.ref.da.0
+; CHECK-ELF-NEXT:      .p2align 3
+; CHECK-ELF-NEXT:    g_strong_def.ref.da.0:
+; CHECK-ELF-NEXT:      .xword g_strong_def@AUTH(da,0)
+
+; CHECK-MACHO-LABEL:   .globl _g_strong_def.ref.da.0
+; CHECK-MACHO-NEXT:    .p2align 3
+; CHECK-MACHO-NEXT:  _g_strong_def.ref.da.0:
+; CHECK-MACHO-NEXT:    .quad _g_strong_def@AUTH(da,0)
+
+@g_strong_def.ref.da.0 = constant ptr ptrauth (ptr @g_strong_def, i32 2)
+
+;--- err-key.ll
+
+; RUN: not --crash llc < err-key.ll -mtriple arm64e-apple-darwin 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-KEY
+; RUN: not --crash llc < err-key.ll -mtriple aarch64-elf -mattr=+pauth 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-KEY
+
+; RUN: not --crash llc < err-key.ll -mtriple arm64e-apple-darwin \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1  2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-KEY
+; RUN: not --crash llc < err-key.ll -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-KEY
+
+; CHECK-ERR-KEY: LLVM ERROR: AArch64 PAC Key ID '4' out of range [0, 3]
+
+@g = external global i32
+@g.ref.4.0 = constant ptr ptrauth (ptr @g, i32 4, i64 0)
+
+;--- err-disc.ll
+
+; RUN: not --crash llc < err-disc.ll -mtriple arm64e-apple-darwin 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-DISC
+; RUN: not --crash llc < err-disc.ll -mtriple aarch64-elf -mattr=+pauth 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-DISC
+
+; RUN: not --crash llc < err-disc.ll -mtriple arm64e-apple-darwin \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1  2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-DISC
+; RUN: not --crash llc < err-disc.ll -mtriple aarch64-elf -mattr=+pauth \
+; RUN:   -global-isel -verify-machineinstrs -global-isel-abort=1 2>&1 \
+; RUN:   | FileCheck %s --check-prefix=CHECK-ERR-DISC
+
+; CHECK-ERR-DISC: LLVM ERROR: AArch64 PAC Discriminator '65536' out of range [0, 0xFFFF]
+
+@g = external global i32
+@g.ref.ia.65536 = constant ptr ptrauth (ptr @g, i32 0, i64 65536)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index fab94875516973..93c30a6a01e002 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -209,4 +209,165 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
   ret <2 x i16> %ret
 }
 
+define <2 x half> @local_atomic_fadd_ret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
+; GFX940-LABEL: local_atomic_fadd_ret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0 offset:65532
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    v_pk_add_f16 v2, v3, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2 offset:65532
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383
+  %result = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst
+  ret <2 x half> %result
+}
+
+define void @local_atomic_fadd_noret_v2f16_offset(ptr addrspace(3) %ptr, <2 x half> %val) {
+; GFX940-LABEL: local_atomic_fadd_noret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0 offset:65532
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v3, v2, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3 offset:65532
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(3) %ptr, i32 16383
+  %unused = atomicrmw fadd ptr addrspace(3) %gep, <2 x half> %val seq_cst
+  ret void
+}
+
+define <2 x half> @global_atomic_fadd_ret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX940-LABEL: global_atomic_fadd_ret_v2f16_agent_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
+}
+
+define void @global_atomic_fadd_noret_v2f16_agent_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX940-LABEL: global_atomic_fadd_noret_v2f16_agent_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %unused = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x half> @flat_atomic_fadd_ret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) {
+; GFX940-LABEL: flat_atomic_fadd_ret_v2f16_agent_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1] offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i32 256
+  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
+}
+
+define void @flat_atomic_fadd_noret_v2f16_agent_offset(ptr %ptr, <2 x half> %val) {
+; GFX940-LABEL: flat_atomic_fadd_noret_v2f16_agent_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v5, v[0:1] offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i32 256
+  %unused = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
 attributes #0 = { "denormal-fp-math-f32"="ieee,ieee" }
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
index 8262cfd34823ff..5724cf471bae36 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-atomicrmw.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=fiji -O0 -stop-after=irtranslator -o - %s | FileCheck %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx940 -O0 -stop-after=irtranslator -o - %s | FileCheck %s
 
 define float @test_atomicrmw_fadd(ptr addrspace(3) %addr) {
   ; CHECK-LABEL: name: test_atomicrmw_fadd
@@ -34,14 +34,14 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) {
   ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[LOAD]](s32), %bb.1, %13(s32), %bb.2
   ; CHECK-NEXT:   [[FSUB:%[0-9]+]]:_(s32) = G_FSUB [[PHI1]], [[C]]
   ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[PHI1]], [[FSUB]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
-  ; CHECK-NEXT:   [[INTRINSIC:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
-  ; CHECK-NEXT:   [[INTRINSIC_W_SIDE_EFFECTS:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INTRINSIC]](s64)
-  ; CHECK-NEXT:   G_BRCOND [[INTRINSIC_W_SIDE_EFFECTS]](s1), %bb.3
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INT1]](s1), %bb.3
   ; CHECK-NEXT:   G_BR %bb.2
   ; CHECK-NEXT: {{  $}}
   ; CHECK-NEXT: bb.3.atomicrmw.end:
   ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32), %bb.2
-  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INTRINSIC]](s64), %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
   ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
   ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](s32)
   ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
@@ -49,5 +49,157 @@ define float @test_atomicrmw_fsub(ptr addrspace(3) %addr) {
   ret float %oldval
 }
 
+define <2 x half> @test_atomicrmw_fadd_vector(ptr addrspace(3) %addr) {
+  ; CHECK-LABEL: name: test_atomicrmw_fadd_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.atomicrmw.start:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[FADD:%[0-9]+]]:_(<2 x s16>) = G_FADD [[PHI1]], [[BUILD_VECTOR]]
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FADD]](<2 x s16>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>)
+  ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32)
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INT1]](s1), %bb.3
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.atomicrmw.end:
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](<2 x s16>)
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  %oldval = atomicrmw fadd ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst
+  ret <2 x half> %oldval
+}
+
+define <2 x half> @test_atomicrmw_fsub_vector(ptr addrspace(3) %addr) {
+  ; CHECK-LABEL: name: test_atomicrmw_fsub_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.atomicrmw.start:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[FSUB:%[0-9]+]]:_(<2 x s16>) = G_FSUB [[PHI1]], [[BUILD_VECTOR]]
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FSUB]](<2 x s16>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>)
+  ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32)
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INT1]](s1), %bb.3
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.atomicrmw.end:
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](<2 x s16>)
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  %oldval = atomicrmw fsub ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst
+  ret <2 x half> %oldval
+}
+
+define <2 x half> @test_atomicrmw_fmin_vector(ptr addrspace(3) %addr) {
+  ; CHECK-LABEL: name: test_atomicrmw_fmin_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.atomicrmw.start:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[FMINNUM:%[0-9]+]]:_(<2 x s16>) = G_FMINNUM [[PHI1]], [[BUILD_VECTOR]]
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMINNUM]](<2 x s16>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>)
+  ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32)
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INT1]](s1), %bb.3
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.atomicrmw.end:
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](<2 x s16>)
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  %oldval = atomicrmw fmin ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst
+  ret <2 x half> %oldval
+}
+
+define <2 x half> @test_atomicrmw_fmax_vector(ptr addrspace(3) %addr) {
+  ; CHECK-LABEL: name: test_atomicrmw_fmax_vector
+  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT:   liveins: $vgpr0
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:_(p3) = COPY $vgpr0
+  ; CHECK-NEXT:   [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
+  ; CHECK-NEXT:   [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s16>) = G_BUILD_VECTOR [[C]](s16), [[C]](s16)
+  ; CHECK-NEXT:   [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
+  ; CHECK-NEXT:   [[LOAD:%[0-9]+]]:_(<2 x s16>) = G_LOAD [[COPY]](p3) :: (load (<2 x s16>) from %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2.atomicrmw.start:
+  ; CHECK-NEXT:   successors: %bb.3(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:_(s64) = G_PHI %19(s64), %bb.2, [[C1]](s64), %bb.1
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:_(<2 x s16>) = G_PHI [[LOAD]](<2 x s16>), %bb.1, %18(<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[FMAXNUM:%[0-9]+]]:_(<2 x s16>) = G_FMAXNUM [[PHI1]], [[BUILD_VECTOR]]
+  ; CHECK-NEXT:   [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[FMAXNUM]](<2 x s16>)
+  ; CHECK-NEXT:   [[BITCAST1:%[0-9]+]]:_(s32) = G_BITCAST [[PHI1]](<2 x s16>)
+  ; CHECK-NEXT:   [[ATOMIC_CMPXCHG_WITH_SUCCESS:%[0-9]+]]:_(s32), [[ATOMIC_CMPXCHG_WITH_SUCCESS1:%[0-9]+]]:_(s1) = G_ATOMIC_CMPXCHG_WITH_SUCCESS [[COPY]](p3), [[BITCAST1]], [[BITCAST]] :: (load store seq_cst seq_cst (s32) on %ir.addr, addrspace 3)
+  ; CHECK-NEXT:   [[BITCAST2:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[ATOMIC_CMPXCHG_WITH_SUCCESS]](s32)
+  ; CHECK-NEXT:   [[INT:%[0-9]+]]:_(s64) = G_INTRINSIC intrinsic(@llvm.amdgcn.if.break), [[ATOMIC_CMPXCHG_WITH_SUCCESS1]](s1), [[PHI]](s64)
+  ; CHECK-NEXT:   [[INT1:%[0-9]+]]:_(s1) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.loop), [[INT]](s64)
+  ; CHECK-NEXT:   G_BRCOND [[INT1]](s1), %bb.3
+  ; CHECK-NEXT:   G_BR %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3.atomicrmw.end:
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:_(<2 x s16>) = G_PHI [[BITCAST2]](<2 x s16>), %bb.2
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:_(s64) = G_PHI [[INT]](s64), %bb.2
+  ; CHECK-NEXT:   G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI3]](s64)
+  ; CHECK-NEXT:   $vgpr0 = COPY [[PHI2]](<2 x s16>)
+  ; CHECK-NEXT:   SI_RETURN implicit $vgpr0
+  %oldval = atomicrmw fmax ptr addrspace(3) %addr, <2 x half> <half 1.0, half 1.0> seq_cst
+  ret <2 x half> %oldval
+}
+
 !llvm.module.flags = !{!0}
 !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
index 376fe79f542e36..647c5b568b7ad5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.v2f16.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x half> %data) {
   ; GFX940-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
@@ -13,6 +14,18 @@ define amdgpu_ps void @flat_atomic_fadd_v2f16_no_rtn_intrinsic(ptr %ptr, <2 x ha
   ; GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
   ; GFX940-NEXT:   FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX940-NEXT:   S_ENDPGM 0
+  ;
+  ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_no_rtn_intrinsic
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX12-NEXT:   FLAT_ATOMIC_PK_ADD_F16 killed [[COPY3]], [[COPY]], 0, 0, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+  ; GFX12-NEXT:   S_ENDPGM 0
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
   ret void
 }
@@ -30,84 +43,21 @@ define amdgpu_ps <2 x half> @flat_atomic_fadd_v2f16_rtn_intrinsic(ptr %ptr, <2 x
   ; GFX940-NEXT:   [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
   ; GFX940-NEXT:   $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
   ; GFX940-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
+  ;
+  ; GFX12-LABEL: name: flat_atomic_fadd_v2f16_rtn_intrinsic
+  ; GFX12: bb.0 (%ir-block.0):
+  ; GFX12-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
+  ; GFX12-NEXT: {{  $}}
+  ; GFX12-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+  ; GFX12-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+  ; GFX12-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+  ; GFX12-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+  ; GFX12-NEXT:   [[COPY3:%[0-9]+]]:vreg_64 = COPY [[REG_SEQUENCE]]
+  ; GFX12-NEXT:   [[FLAT_ATOMIC_PK_ADD_F16_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_PK_ADD_F16_RTN killed [[COPY3]], [[COPY]], 0, 1, implicit $exec, implicit $flat_scr :: (volatile dereferenceable load store (s32) on %ir.ptr)
+  ; GFX12-NEXT:   $vgpr0 = COPY [[FLAT_ATOMIC_PK_ADD_F16_RTN]]
+  ; GFX12-NEXT:   SI_RETURN_TO_EPILOG $vgpr0
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
 
 declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p1.v2f16(ptr, <2 x half>)
-
-define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) {
-  ; GFX940-LABEL: name: flat_agent_atomic_fadd_ret_v2f16
-  ; GFX940: bb.0 (%ir-block.0):
-  ; GFX940-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX940-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX940-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
-  ; GFX940-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
-  ; GFX940-NEXT:   [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr)
-  ; GFX940-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT: bb.1.atomicrmw.start:
-  ; GFX940-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT:   [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1
-  ; GFX940-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1
-  ; GFX940-NEXT:   [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  ; GFX940-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1
-  ; GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
-  ; GFX940-NEXT:   [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr)
-  ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec
-  ; GFX940-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc
-  ; GFX940-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX940-NEXT:   S_BRANCH %bb.2
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT: bb.2.atomicrmw.end:
-  ; GFX940-NEXT:   [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[FLAT_ATOMIC_CMPSWAP_RTN]], %bb.1
-  ; GFX940-NEXT:   [[PHI3:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
-  ; GFX940-NEXT:   SI_END_CF [[PHI3]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX940-NEXT:   $vgpr0 = COPY [[PHI2]]
-  ; GFX940-NEXT:   SI_RETURN implicit $vgpr0
-  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
-  ret <2 x half> %result
-}
-
-define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) {
-  ; GFX940-LABEL: name: flat_agent_atomic_fadd_noret_v2f16
-  ; GFX940: bb.0 (%ir-block.0):
-  ; GFX940-NEXT:   successors: %bb.1(0x80000000)
-  ; GFX940-NEXT:   liveins: $vgpr0, $vgpr1, $vgpr2
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT:   [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr2
-  ; GFX940-NEXT:   [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-  ; GFX940-NEXT:   [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-  ; GFX940-NEXT:   [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
-  ; GFX940-NEXT:   [[COPY3:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
-  ; GFX940-NEXT:   [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
-  ; GFX940-NEXT:   [[FLAT_LOAD_DWORD:%[0-9]+]]:vgpr_32 = FLAT_LOAD_DWORD [[COPY4]], 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %ir.ptr)
-  ; GFX940-NEXT:   [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT: bb.1.atomicrmw.start:
-  ; GFX940-NEXT:   successors: %bb.2(0x04000000), %bb.1(0x7c000000)
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT:   [[PHI:%[0-9]+]]:sreg_64 = PHI [[S_MOV_B64_]], %bb.0, %4, %bb.1
-  ; GFX940-NEXT:   [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[FLAT_LOAD_DWORD]], %bb.0, %3, %bb.1
-  ; GFX940-NEXT:   [[V_PK_ADD_F16_:%[0-9]+]]:vgpr_32 = nofpexcept V_PK_ADD_F16 8, [[PHI1]], 8, [[COPY]], 0, 0, 0, 0, 0, implicit $mode, implicit $exec
-  ; GFX940-NEXT:   [[REG_SEQUENCE1:%[0-9]+]]:sreg_64 = REG_SEQUENCE killed [[V_PK_ADD_F16_]], %subreg.sub0, [[PHI1]], %subreg.sub1
-  ; GFX940-NEXT:   [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
-  ; GFX940-NEXT:   [[FLAT_ATOMIC_CMPSWAP_RTN:%[0-9]+]]:vgpr_32 = FLAT_ATOMIC_CMPSWAP_RTN [[COPY3]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("agent") seq_cst seq_cst (s32) on %ir.ptr)
-  ; GFX940-NEXT:   [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64 = V_CMP_EQ_U32_e64 [[FLAT_ATOMIC_CMPSWAP_RTN]], [[PHI1]], implicit $exec
-  ; GFX940-NEXT:   [[SI_IF_BREAK:%[0-9]+]]:sreg_64 = SI_IF_BREAK killed [[V_CMP_EQ_U32_e64_]], [[PHI]], implicit-def dead $scc
-  ; GFX940-NEXT:   SI_LOOP [[SI_IF_BREAK]], %bb.1, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX940-NEXT:   S_BRANCH %bb.2
-  ; GFX940-NEXT: {{  $}}
-  ; GFX940-NEXT: bb.2.atomicrmw.end:
-  ; GFX940-NEXT:   [[PHI2:%[0-9]+]]:sreg_64 = PHI [[SI_IF_BREAK]], %bb.1
-  ; GFX940-NEXT:   SI_END_CF [[PHI2]], implicit-def dead $exec, implicit-def dead $scc, implicit $exec
-  ; GFX940-NEXT:   SI_RETURN
-  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
-  ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
new file mode 100644
index 00000000000000..1e7cf0e702a03f
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -0,0 +1,5084 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefixes=GFX90,GFX908 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90,GFX90A %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefix=GFX940 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v3, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB0_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
+}
+
+define void @flat_agent_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v4, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v5, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v4, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v3, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define void @flat_agent_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX90:       ; %bb.0:
+; GFX90-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX90-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90-NEXT:    buffer_wbinvl1_vol
+; GFX90-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX90-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB3_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x half> @flat_system_atomic_fadd_ret_v2f16(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v6
+; GFX7-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT:    v_or_b32_e32 v7, v2, v3
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v8, v2
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[6:7] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v2
+; GFX7-NEXT:    v_mov_b32_e32 v1, v3
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v3, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_system_atomic_fadd_ret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst
+  ret <2 x half> %result
+}
+
+define void @flat_system_atomic_fadd_noret_v2f16(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v4, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v5, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v5, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v4, v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v4, v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_system_atomic_fadd_noret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v4, v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x half> %val seq_cst
+  ret void
+}
+
+define <2 x bfloat> @flat_system_atomic_fadd_ret_v2bf16(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v6, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v2
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX7-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v4
+; GFX7-NEXT:    v_alignbit_b32 v3, v2, v3, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v2, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v2, v2, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[2:3] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v3
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v3
+; GFX7-NEXT:    v_mov_b32_e32 v1, v2
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v3
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v3, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v3
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v3, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[6:7] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[5:6] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_system_atomic_fadd_ret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst
+  ret <2 x bfloat> %result
+}
+
+define void @flat_system_atomic_fadd_noret_v2bf16(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dword v3, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dword v3, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dword v3, v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_system_atomic_fadd_noret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <2 x bfloat> %val seq_cst
+  ret void
+}
+
+define <4 x half> @flat_agent_atomic_fadd_ret_v4f16(ptr %ptr, <4 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 4, v0
+; GFX7-NEXT:    flat_load_dword v9, v[0:1]
+; GFX7-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v10, v[6:7]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v2, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v6
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[10:11], v[0:1], v[2:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_add_f16_sdwa v4, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v7, v3
+; GFX8-NEXT:    v_add_f16_sdwa v8, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v9, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v9, v8
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX908-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX940-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v1, v5
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX10-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX11-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v4f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX12-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <4 x half> %val syncscope("agent") seq_cst
+  ret <4 x half> %result
+}
+
+define <4 x half> @flat_system_atomic_fadd_ret_v4f16(ptr %ptr, <4 x half> %val) {
+; GFX7-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v6, vcc, 4, v0
+; GFX7-NEXT:    flat_load_dword v9, v[0:1]
+; GFX7-NEXT:    v_addc_u32_e32 v7, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v10, v[6:7]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v8, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v9
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v9
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v9, v11
+; GFX7-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v4
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v11, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX7-NEXT:    v_or_b32_e32 v5, v2, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT:    v_add_f32_e32 v11, v11, v6
+; GFX7-NEXT:    v_add_f32_e32 v3, v3, v8
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v11, v11
+; GFX7-NEXT:    v_add_f32_e32 v10, v10, v7
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_add_f32_e32 v2, v2, v9
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v10, v10
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v12, v2
+; GFX7-NEXT:    v_lshlrev_b32_e32 v2, 16, v11
+; GFX7-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX7-NEXT:    v_or_b32_e32 v3, v12, v3
+; GFX7-NEXT:    flat_atomic_cmpswap_x2 v[10:11], v[0:1], v[2:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u64_e32 vcc, v[10:11], v[4:5]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v10
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v11
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v10
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    v_mov_b32_e32 v0, v4
+; GFX7-NEXT:    v_mov_b32_e32 v1, v5
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v5, vcc, 4, v0
+; GFX8-NEXT:    v_addc_u32_e32 v6, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    flat_load_dword v5, v[5:6]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v7, v5
+; GFX8-NEXT:    v_mov_b32_e32 v6, v4
+; GFX8-NEXT:    v_add_f16_sdwa v4, v7, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v7, v3
+; GFX8-NEXT:    v_add_f16_sdwa v8, v6, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v9, v6, v2
+; GFX8-NEXT:    v_or_b32_e32 v5, v5, v4
+; GFX8-NEXT:    v_or_b32_e32 v4, v9, v8
+; GFX8-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v0, v4
+; GFX8-NEXT:    v_mov_b32_e32 v1, v5
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v7, v5
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX908-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX908-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v4
+; GFX908-NEXT:    v_mov_b32_e32 v1, v5
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v4
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v5
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX940-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7]
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v4
+; GFX940-NEXT:    v_mov_b32_e32 v1, v5
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v7, v5
+; GFX10-NEXT:    v_mov_b32_e32 v6, v4
+; GFX10-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX10-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v4
+; GFX10-NEXT:    v_mov_b32_e32 v1, v5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX11-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_system_atomic_fadd_ret_v4f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b64 v[4:5], v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_pk_add_f16 v5, v7, v3
+; GFX12-NEXT:    v_pk_add_f16 v4, v6, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr %ptr, <4 x half> %val seq_cst
+  ret <4 x half> %result
+}
+
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_offset(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x3ffc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v1, v[4:5]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3ffc, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v5, v1
+; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v4
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x3ffc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX940-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3ffc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    flat_load_dword v0, v[3:4]
+; GFX10-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v3
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3ffc, v3
+; GFX11-NEXT:    flat_load_b32 v0, v[4:5] offset:4092
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:16380
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i64 4095
+  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
+}
+
+define <2 x half> @flat_agent_atomic_fadd_ret_v2f16_negoffset(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT:    flat_load_dword v1, v[4:5]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v1
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v7
+; GFX7-NEXT:    v_or_b32_e32 v7, v0, v1
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT:    v_or_b32_e32 v6, v8, v0
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[4:5], v[6:7] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v7
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v1, v0
+; GFX8-NEXT:    v_add_f16_sdwa v0, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v1, v2
+; GFX8-NEXT:    v_or_b32_e32 v0, v5, v0
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT:    flat_load_dword v0, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v5, v1
+; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX940-NEXT:    s_movk_i32 s0, 0xf800
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
+; GFX940-NEXT:    flat_load_dword v0, v[0:1]
+; GFX940-NEXT:    s_mov_b32 s1, -1
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, v0
+; GFX940-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    flat_load_dword v0, v[3:4]
+; GFX10-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v1, v0
+; GFX10-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[0:1] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
+}
+
+define void @flat_agent_atomic_fadd_noret_v2f16_offset(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x3ffc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3ffc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT:    flat_load_dword v1, v[0:1] offset:4092
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3ffc, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v1, v[0:1] offset:4092
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3000, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x3ffc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX940-NEXT:    flat_load_dword v5, v[4:5] offset:4092
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3ffc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    flat_load_dword v4, v[0:1]
+; GFX10-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3ffc, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    flat_load_b32 v4, v[3:4] offset:4092
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:16380
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:16380 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i64 4095
+  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @flat_agent_atomic_fadd_noret_v2f16_negoffset(ptr %ptr, <2 x half> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v6, v2
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v2, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_lshrrev_b32_e32 v3, 16, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v3
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v3, v6
+; GFX7-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v5, v5
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v7, v4
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v8, v6
+; GFX7-NEXT:    v_cvt_f16_f32_e32 v7, v7
+; GFX7-NEXT:    v_or_b32_e32 v6, v4, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v4, 16, v8
+; GFX7-NEXT:    v_or_b32_e32 v5, v7, v4
+; GFX7-NEXT:    flat_atomic_cmpswap v7, v[0:1], v[5:6] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v4, v7
+; GFX7-NEXT:    v_cvt_f32_f16_e32 v5, v5
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v4, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[4:5], 0
+; GFX8-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_f16_sdwa v3, v4, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; GFX8-NEXT:    v_add_f16_e32 v5, v4, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v5, v3
+; GFX8-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX8-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, v3
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT:    flat_load_dword v1, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v0, v1, v2
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX940-NEXT:    s_movk_i32 s0, 0xf800
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX940-NEXT:    flat_load_dword v5, v[4:5]
+; GFX940-NEXT:    s_mov_b32 s1, -1
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:    flat_load_dword v4, v[0:1]
+; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_b32 v4, v[3:4]
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v4, v[0:1] offset:-2048
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr %ptr, i64 -512
+  %result = atomicrmw fadd ptr %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_offset(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0x3ffc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v0, v[4:5]
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3ffc, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v5, v1
+; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v4
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x3ffc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX940-NEXT:    flat_load_dword v0, v[0:1] offset:4092
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v3, v0, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3ffc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    flat_load_dword v0, v[3:4]
+; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0x3000, v3
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3ffc, v3
+; GFX11-NEXT:    flat_load_b32 v0, v[4:5] offset:4092
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:16380
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:16380 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 4095
+  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16_negoffset(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v4, vcc, 0xfffff800, v0
+; GFX7-NEXT:    v_addc_u32_e32 v5, vcc, -1, v1, vcc
+; GFX7-NEXT:    flat_load_dword v0, v[4:5]
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v1, 1.0, v1
+; GFX7-NEXT:    v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v1
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v0
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v1, v1, v0, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v0, v0, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v0, v[3:4]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX8-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_mov_b32_e32 v6, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX8-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v0
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v5
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX8-NEXT:    v_alignbit_b32 v5, v5, v0, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT:    flat_load_dword v0, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v0, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v0, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v3, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v3, v0, s9
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v0
+; GFX940-NEXT:    v_mov_b32_e32 v5, v1
+; GFX940-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v4
+; GFX940-NEXT:    s_movk_i32 s0, 0xf800
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
+; GFX940-NEXT:    flat_load_dword v0, v[0:1]
+; GFX940-NEXT:    s_mov_b32 s1, -1
+; GFX940-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v3, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v0, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v3, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v3
+; GFX940-NEXT:    v_add3_u32 v6, v6, v0, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v3, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v0, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v3, v0, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[6:7] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    flat_load_dword v0, v[3:4]
+; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v0
+; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v0, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v3, v0
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
+; GFX11-NEXT:    flat_load_b32 v0, v[4:5]
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v0, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v0, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v0, v[3:4], v[5:6] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v3, v[0:1], v[5:6] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define void @flat_agent_atomic_fadd_noret_v2bf16_offset(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0x3ffc, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0x3ffc, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0x3ffc, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX908-NEXT:    flat_load_dword v1, v[0:1] offset:4092
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v6, v0, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3ffc, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0x3000, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v1, v[0:1] offset:4092
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX90A-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0x3000, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0x3ffc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX940-NEXT:    flat_load_dword v3, v[4:5] offset:4092
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3ffc, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0x3000, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0x3ffc, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-NEXT:    flat_load_b32 v3, v[3:4] offset:4092
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:16380
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16380 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 4095
+  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @flat_agent_atomic_fadd_noret_v2bf16_negoffset(ptr %ptr, <2 x bfloat> %val) {
+; GFX7-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xfffff800, v0
+; GFX7-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX7-NEXT:    flat_load_dword v5, v[0:1]
+; GFX7-NEXT:    v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT:    v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT:    s_mov_b64 s[4:5], 0
+; GFX7-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT:    v_and_b32_e32 v3, 0xffff0000, v3
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX7-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX7-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT:    v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT:    v_mul_f32_e32 v5, 1.0, v5
+; GFX7-NEXT:    v_and_b32_e32 v7, 0xffff0000, v4
+; GFX7-NEXT:    v_and_b32_e32 v6, 0xffff0000, v5
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX7-NEXT:    v_add_f32_e32 v7, v7, v3
+; GFX7-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX7-NEXT:    v_alignbit_b32 v5, v4, v5, 16
+; GFX7-NEXT:    v_lshrrev_b32_e32 v4, 16, v7
+; GFX7-NEXT:    v_alignbit_b32 v4, v4, v6, 16
+; GFX7-NEXT:    flat_atomic_cmpswap v6, v[0:1], v[4:5] glc
+; GFX7-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    buffer_wbinvl1_vol
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, v6, v5
+; GFX7-NEXT:    v_and_b32_e32 v4, 0xffff0000, v6
+; GFX7-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT:    v_lshlrev_b32_e32 v5, 16, v6
+; GFX7-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX7-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, -1, v1, vcc
+; GFX8-NEXT:    flat_load_dword v3, v[0:1]
+; GFX8-NEXT:    s_mov_b64 s[6:7], 0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX8-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX8-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX8-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX8-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX8-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX8-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v2
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v6
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, 0x7fff, v7
+; GFX8-NEXT:    v_add_u32_e32 v9, vcc, 0x7fff, v9
+; GFX8-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX8-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX8-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX8-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
+; GFX8-NEXT:    v_alignbit_b32 v2, v6, v2, 16
+; GFX8-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX8-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT:    buffer_wbinvl1_vol
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX8-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX8-NEXT:    v_mov_b32_e32 v3, v2
+; GFX8-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX8-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX8-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX8-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
+; GFX908-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT:    flat_load_dword v1, v[0:1]
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v5, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX908-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX908-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v0, v6, v0, s9
+; GFX908-NEXT:    flat_atomic_cmpswap v0, v[3:4], v[0:1] glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v0
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX90A-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT:    flat_load_dword v1, v[0:1]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v1
+; GFX90A-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v2
+; GFX90A-NEXT:    v_bfe_u32 v7, v0, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v0
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v0, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v0, v0
+; GFX90A-NEXT:    v_cndmask_b32_e64 v0, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v0, v6, v0, s9
+; GFX90A-NEXT:    flat_atomic_cmpswap v0, v[4:5], v[0:1] glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v1, v0
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
+; GFX940-NEXT:    s_movk_i32 s0, 0xf800
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
+; GFX940-NEXT:    flat_load_dword v3, v[4:5]
+; GFX940-NEXT:    s_mov_b32 s1, -1
+; GFX940-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    flat_load_dword v3, v[0:1]
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, -1, v1, vcc_lo
+; GFX11-NEXT:    flat_load_b32 v3, v[3:4]
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1] offset:-2048
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:-2048 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr %ptr, i64 -512
+  %result = atomicrmw fadd ptr %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 50a27d42322d7a..16ddf91ebf8f02 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs | FileCheck %s -check-prefix=GFX940
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefix=GFX12
 
 declare float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
 declare <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
@@ -20,6 +21,15 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret(ptr %ptr, float %data) {
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_add_f32 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_atomic_fadd_f32_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    flat_atomic_add_f32 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret void
 }
@@ -36,6 +46,29 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:  .LBB1_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v2, 4.0, v3
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB1_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_endpgm
   %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret void
 }
@@ -52,6 +85,29 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    flat_load_b32 v3, v[0:1]
+; GFX12-NEXT:  .LBB2_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_add_f32_e32 v2, 4.0, v3
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB2_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_endpgm
   %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret void
 }
@@ -63,6 +119,17 @@ define float @flat_atomic_fadd_f32_rtn(ptr %ptr, float %data) {
 ; GFX940-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_atomic_fadd_f32_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_atomic_add_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call float @llvm.amdgcn.flat.atomic.fadd.f32.p0.f32(ptr %ptr, float %data)
   ret float %ret
 }
@@ -77,6 +144,35 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_atomic_fadd_f32_rtn_pat:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_load_b32 v2, v[0:1]
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_f32_e32 v2, 4.0, v3
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
   ret float %ret
 }
@@ -91,6 +187,15 @@ define amdgpu_kernel void @flat_atomic_fadd_v2f16_noret(ptr %ptr, <2 x half> %da
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_atomic_fadd_v2f16_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    flat_atomic_pk_add_f16 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret void
 }
@@ -102,6 +207,17 @@ define <2 x half> @flat_atomic_fadd_v2f16_rtn(ptr %ptr, <2 x half> %data) {
 ; GFX940-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_atomic_fadd_v2f16_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_atomic_pk_add_f16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.flat.atomic.fadd.v2f16.p0.v2f16(ptr %ptr, <2 x half> %data)
   ret <2 x half> %ret
 }
@@ -116,6 +232,15 @@ define amdgpu_kernel void @flat_atomic_fadd_v2bf16_noret(ptr %ptr, <2 x i16> %da
 ; GFX940-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: flat_atomic_fadd_v2bf16_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    v_mov_b32_e32 v2, s2
+; GFX12-NEXT:    flat_atomic_pk_add_bf16 v[0:1], v2
+; GFX12-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret void
 }
@@ -127,6 +252,17 @@ define <2 x i16> @flat_atomic_fadd_v2bf16_rtn(ptr %ptr, <2 x i16> %data) {
 ; GFX940-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: flat_atomic_fadd_v2bf16_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    flat_atomic_pk_add_bf16 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.flat.atomic.fadd.v2bf16.p0(ptr %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
@@ -141,6 +277,16 @@ define amdgpu_kernel void @global_atomic_fadd_v2bf16_noret(ptr addrspace(1) %ptr
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[2:3]
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_v2bf16_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v1, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret void
 }
@@ -152,6 +298,17 @@ define <2 x i16> @global_atomic_fadd_v2bf16_rtn(ptr addrspace(1) %ptr, <2 x i16>
 ; GFX940-NEXT:    global_atomic_pk_add_bf16 v0, v[0:1], v2, off sc0
 ; GFX940-NEXT:    s_waitcnt vmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_atomic_fadd_v2bf16_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_pk_add_bf16 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.global.atomic.fadd.v2bf16.p1(ptr addrspace(1) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
@@ -165,6 +322,14 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 ; GFX940-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX940-NEXT:    ds_pk_add_f16 v0, v1
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: local_atomic_fadd_v2f16_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ds_pk_add_f16 v0, v1
+; GFX12-NEXT:    s_endpgm
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret void
 }
@@ -176,6 +341,17 @@ define <2 x half> @local_atomic_fadd_v2f16_rtn(ptr addrspace(3) %ptr, <2 x half>
 ; GFX940-NEXT:    ds_pk_add_rtn_f16 v0, v0, v1
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: local_atomic_fadd_v2f16_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_pk_add_rtn_f16 v0, v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x half> @llvm.amdgcn.ds.fadd.v2f16(ptr addrspace(3) %ptr, <2 x half> %data, i32 0, i32 0, i1 0)
   ret <2 x half> %ret
 }
@@ -192,6 +368,16 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: local_atomic_fadd_v2bf16_noret:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ds_pk_add_bf16 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_endpgm
   %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret void
 }
@@ -205,6 +391,19 @@ define <2 x i16> @local_atomic_fadd_v2bf16_rtn(ptr addrspace(3) %ptr, <2 x i16>
 ; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX940-NEXT:    buffer_inv sc0 sc1
 ; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: local_atomic_fadd_v2bf16_rtn:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_pk_add_rtn_bf16 v0, v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %ret = call <2 x i16> @llvm.amdgcn.ds.fadd.v2bf16(ptr addrspace(3) %ptr, <2 x i16> %data)
   ret <2 x i16> %ret
 }
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index d10e049444d685..88ce07c9779d0a 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -2,8 +2,10 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX940 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 
 define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0 {
 ; GFX900-LABEL: global_atomic_fadd_ret_f32:
@@ -131,6 +133,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB0_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    v_mul_f32_e32 v2, 4.0, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:  .LBB0_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX940-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX940-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
@@ -214,6 +246,48 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(ptr addrspace(1) %ptr) #0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB0_4
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b32 s5, s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_add_f32_e32 v4, v5, v2
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX12-NEXT:  ; %bb.3: ; %Flow
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:  .LBB0_4: ; %Flow1
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX12-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 seq_cst
   store float %result, ptr addrspace(1) undef
   ret void
@@ -332,6 +406,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_f32_ieee:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB1_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    v_mul_f32_e32 v2, 4.0, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:  .LBB1_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX940-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX940-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_ieee:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
@@ -402,6 +506,34 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(ptr addrspace(1) %ptr
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_f32_ieee:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB1_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_add_f32 v1, v2, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:  .LBB1_2:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX12-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   store float %result, ptr addrspace(1) undef
   ret void
@@ -483,6 +615,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
 ; GFX90A-NEXT:  .LBB2_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_noret_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB2_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:  .LBB2_2:
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s3, exec_lo
@@ -535,6 +689,25 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(ptr addrspace(1) %ptr) #
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:  .LBB2_2:
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_noret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB2_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:  .LBB2_2:
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
@@ -615,6 +788,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
 ; GFX90A-NEXT:  .LBB3_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_noret_f32_ieee:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB3_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:  .LBB3_2:
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32_ieee:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s3, exec_lo
@@ -667,6 +862,25 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(ptr addrspace(1) %p
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:  .LBB3_2:
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_noret_f32_ieee:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB3_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, s2
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mul_f32 v0, 4.0, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_add_f32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:  .LBB3_2:
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
@@ -781,6 +995,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_f32_agent:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB4_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    v_mul_f32_e32 v2, 4.0, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:  .LBB4_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX940-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX940-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_agent:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
@@ -850,6 +1094,34 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(ptr addrspace(1) %pt
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_f32_agent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB4_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
+; GFX12-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mul_f32 v1, 4.0, v1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_add_f32 v1, v2, v1, s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:  .LBB4_2:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX12-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   store float %result, ptr addrspace(1) undef
   ret void
@@ -981,6 +1253,36 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
 ; GFX90A-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_f32_system:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB5_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v2, s4
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    v_mul_f32_e32 v2, 4.0, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v1, v1, v2, s[0:1] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:  .LBB5_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX940-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX940-NEXT:    global_store_dword v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_f32_system:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s4, exec_lo
@@ -1064,6 +1366,48 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_f32_system:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB5_4
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s4, s4
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b32 s5, s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mul_f32 v2, 4.0, v1 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT:  .LBB5_2: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_add_f32_e32 v4, v5, v2
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v1, v3, v[4:5], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX12-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX12-NEXT:  ; %bb.3: ; %Flow
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:  .LBB5_4: ; %Flow1
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s2
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX12-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX12-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("one-as") seq_cst
   store float %result, ptr addrspace(1) undef
   ret void
@@ -1151,6 +1495,47 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(ptr addrsp
 ; GFX11-NEXT:    v_mad_f32 v0, v0, 4.0, s0
 ; GFX11-NEXT:    global_store_dword v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_f32_wrong_subtarget:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b64 s[4:5], exec
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX12-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX12-NEXT:    s_cbranch_execz .LBB6_4
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b64 s7, s[4:5]
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s7
+; GFX12-NEXT:    s_mov_b64 s[4:5], 0
+; GFX12-NEXT:    v_mul_f32_e32 v2, 4.0, v1
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    v_mov_b32_e32 v1, s6
+; GFX12-NEXT:  .LBB6_2: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v5, v1
+; GFX12-NEXT:    v_add_f32_e32 v4, v5, v2
+; GFX12-NEXT:    global_atomic_cmpswap v1, v3, v[4:5], s[0:1] glc
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_wbinvl1_vol
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v5
+; GFX12-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX12-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX12-NEXT:  ; %bb.3: ; %Flow
+; GFX12-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX12-NEXT:  .LBB6_4: ; %Flow1
+; GFX12-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    v_mad_f32 v0, v0, 4.0, s0
+; GFX12-NEXT:    global_store_dword v[0:1], v0, off
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   store float %result, ptr addrspace(1) undef
   ret void
@@ -1198,6 +1583,27 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(ptr addr
 ; GFX11-NEXT:    buffer_wbinvl1_vol
 ; GFX11-NEXT:  .LBB7_2:
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_noret_f32_wrong_subtarget:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b64 s[2:3], exec
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX12-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX12-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX12-NEXT:    s_cbranch_execz .LBB7_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX12-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX12-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX12-NEXT:    s_waitcnt vmcnt(0)
+; GFX12-NEXT:    buffer_wbinvl1_vol
+; GFX12-NEXT:  .LBB7_2:
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
@@ -1302,6 +1708,28 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
 ; GFX90A-NEXT:  .LBB8_3:
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_noret_f32_safe:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB8_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:  .LBB8_2:
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_noret_f32_safe:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s3, exec_lo
@@ -1366,6 +1794,37 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_safe(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_cbranch_execnz .LBB8_2
 ; GFX11-NEXT:  .LBB8_3:
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_noret_f32_safe:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    s_mov_b32 s2, 0
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB8_3
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s3, s3
+; GFX12-NEXT:    v_mov_b32_e32 v3, 0
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b32 s4, s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_dual_mul_f32 v2, 4.0, v0 :: v_dual_mov_b32 v1, s4
+; GFX12-NEXT:  .LBB8_2: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v0, v3, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX12-NEXT:    v_mov_b32_e32 v1, v0
+; GFX12-NEXT:    s_or_b32 s2, vcc_lo, s2
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX12-NEXT:  .LBB8_3:
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, float 4.0 syncscope("agent") seq_cst
   ret void
 }
@@ -1444,6 +1903,26 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 ; GFX90A-NEXT:  .LBB9_2:
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: infer_as_before_atomic:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_mov_b64 s[2:3], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB9_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX940-NEXT:  .LBB9_2:
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: infer_as_before_atomic:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s3, exec_lo
@@ -1496,6 +1975,27 @@ define amdgpu_kernel void @infer_as_before_atomic(ptr addrspace(4) %arg) #0 {
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: infer_as_before_atomic:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_mov_b32 s2, exec_lo
+; GFX12-NEXT:    s_mov_b32 s3, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB9_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_bcnt1_i32_b32 s2, s2
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s2
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX12-NEXT:  .LBB9_2:
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %load = load ptr, ptr addrspace(4) %arg
   %v = atomicrmw fadd ptr %load, float 1.0 syncscope("agent-one-as") monotonic, align 4
   ret void
@@ -1625,6 +2125,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX90A-NEXT:    global_store_short v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_and_b32 s2, s6, -4
+; GFX940-NEXT:    s_mov_b32 s3, s7
+; GFX940-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX940-NEXT:    s_and_b32 s5, s6, 3
+; GFX940-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX940-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX940-NEXT:    s_not_b32 s6, s6
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NEXT:    v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX940-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX940-NEXT:    v_add3_u32 v2, v2, v1, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX940-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT:    v_and_or_b32 v2, v3, s6, v1
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s5, v1
+; GFX940-NEXT:    global_store_short v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_bf16_agent:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -1711,6 +2255,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_agent(ptr addrspace(1) %p
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_bf16_agent:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_and_b32 s0, s2, -4
+; GFX12-NEXT:    s_mov_b32 s1, s3
+; GFX12-NEXT:    s_and_b32 s2, s2, 3
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX12-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX12-NEXT:    s_not_b32 s4, s4
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v2, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX12-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX12-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX12-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("agent") seq_cst
   store bfloat %result, ptr addrspace(1) undef
   ret void
@@ -1842,6 +2430,50 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX90A-NEXT:    global_store_short v[0:1], v0, off
 ; GFX90A-NEXT:    s_endpgm
 ;
+; GFX940-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_load_dwordx2 s[6:7], s[0:1], 0x24
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_mov_b32_e32 v0, 0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_and_b32 s2, s6, -4
+; GFX940-NEXT:    s_mov_b32 s3, s7
+; GFX940-NEXT:    s_load_dword s7, s[2:3], 0x0
+; GFX940-NEXT:    s_and_b32 s5, s6, 3
+; GFX940-NEXT:    s_lshl_b32 s5, s5, 3
+; GFX940-NEXT:    s_lshl_b32 s6, 0xffff, s5
+; GFX940-NEXT:    s_not_b32 s6, s6
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v1, s7
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    v_mov_b32_e32 v3, v1
+; GFX940-NEXT:    v_lshrrev_b32_sdwa v1, s5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX940-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX940-NEXT:    v_add3_u32 v2, v2, v1, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX940-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT:    v_and_or_b32 v2, v3, s6, v1
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v1, v0, v[2:3], s[2:3] sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_lshrrev_b32_e32 v0, s5, v1
+; GFX940-NEXT:    global_store_short v[0:1], v0, off sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: global_atomic_fadd_ret_bf16_system:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
@@ -1928,13 +2560,57 @@ define amdgpu_kernel void @global_atomic_fadd_ret_bf16_system(ptr addrspace(1) %
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX12-LABEL: global_atomic_fadd_ret_bf16_system:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
+; GFX12-NEXT:    v_mov_b32_e32 v0, 0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_and_b32 s0, s2, -4
+; GFX12-NEXT:    s_mov_b32 s1, s3
+; GFX12-NEXT:    s_and_b32 s2, s2, 3
+; GFX12-NEXT:    s_load_b32 s3, s[0:1], 0x0
+; GFX12-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX12-NEXT:    s_lshl_b32 s4, 0xffff, s2
+; GFX12-NEXT:    s_not_b32 s4, s4
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, s3
+; GFX12-NEXT:    s_mov_b32 s3, 0
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    v_mov_b32_e32 v2, v1
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, s2, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX12-NEXT:    v_add_f32_e32 v1, 4.0, v1
+; GFX12-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc_lo
+; GFX12-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX12-NEXT:    v_lshlrev_b32_e32 v1, s2, v1
+; GFX12-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-NEXT:    s_or_b32 s3, vcc_lo, s3
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
+; GFX12-NEXT:    v_lshrrev_b32_e32 v0, s2, v1
+; GFX12-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
   %result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat 4.0 syncscope("one-as") seq_cst
   store bfloat %result, ptr addrspace(1) undef
   ret void
 }
 
-define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
-; GFX900-LABEL: global_atomic_fadd_ret_v2f16:
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-NEXT:    global_load_dword v3, v[0:1], off
@@ -1956,7 +2632,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_atomic_fadd_ret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX908-NEXT:    global_load_dword v3, v[0:1], off
@@ -1978,7 +2654,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_atomic_fadd_ret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
@@ -2000,7 +2676,30 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_atomic_fadd_ret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    global_load_dword v3, v[0:1], off
@@ -2024,7 +2723,7 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_atomic_fadd_ret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
@@ -2047,380 +2746,4819 @@ define <2 x half> @global_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB12_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB12_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
   ret <2 x half> %result
 }
 
-define void @global_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
-; GFX900-LABEL: global_atomic_fadd_noret_v2f16:
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_load_dword v4, v[0:1], off
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX900-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX900-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
-; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
 ; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX900-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX900-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_atomic_fadd_noret_v2f16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v4, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
 ; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX908-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_atomic_fadd_noret_v2f16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v5, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
 ; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_atomic_fadd_noret_v2f16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX10-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_atomic_fadd_noret_v2f16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
 ; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:  .LBB13_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
-; GFX11-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_execnz .LBB13_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
-  ret void
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB13_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB13_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
 }
 
-define <2 x bfloat> @global_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
-; GFX900-LABEL: global_atomic_fadd_ret_v2bf16:
+define <2 x half> @global_agent_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_load_dword v3, v[0:1], off
-; GFX900-NEXT:    s_mov_b64 s[6:7], 0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX900-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_mov_b32_e32 v6, v3
-; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
-; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
-; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
-; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
-; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX900-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX900-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_atomic_fadd_ret_v2bf16:
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
-; GFX908-NEXT:    s_mov_b64 s[6:7], 0
-; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_mov_b32_e32 v6, v3
-; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
-; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
-; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
-; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
-; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
-; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
-; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
-; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX908-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_atomic_fadd_ret_v2bf16:
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
-; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
-; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
-; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
-; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
-; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
-; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
-; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
-; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
-; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
-; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
-; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
-; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
-; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
-; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
-; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
-; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_atomic_fadd_ret_v2bf16:
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
 ; GFX10-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_mov_b32_e32 v6, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
-; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
-; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
-; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
 ; GFX10-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_atomic_fadd_ret_v2bf16:
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
-; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
 ; GFX11-NEXT:  .LBB14_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-NEXT:    v_mov_b32_e32 v6, v3
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
-; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
-; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
-; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
-; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
-; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
-; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
-; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
-; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
-; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
-; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
-; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
-; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
-; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    s_cbranch_execnz .LBB14_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
-; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
-  ret <2 x bfloat> %result
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret <2 x half> %result
 }
 
-define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
-; GFX900-LABEL: global_atomic_fadd_noret_v2bf16:
+define void @global_agent_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16:
 ; GFX900:       ; %bb.0:
 ; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT:    global_load_dword v3, v[0:1], off
-; GFX900-NEXT:    s_mov_b64 s[6:7], 0
-; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
-; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
-; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
 ; GFX900-NEXT:  .LBB15_1: ; %atomicrmw.start
 ; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
-; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
-; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
-; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
-; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
-; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
-; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
-; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
-; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
-; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
-; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
-; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
-; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
-; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
 ; GFX900-NEXT:    s_waitcnt vmcnt(0)
 ; GFX900-NEXT:    buffer_wbinvl1_vol
-; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off offset:1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:-1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off offset:-1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x half> @global_system_atomic_fadd_ret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst
+  ret <2 x half> %result
+}
+
+define <2 x half> @global_system_atomic_fadd_ret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  ret <2 x half> %result
+}
+
+define <2 x half> @global_system_atomic_fadd_ret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  ret <2 x half> %result
+}
+
+define void @global_system_atomic_fadd_noret_v2f16(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %val seq_cst
+  ret void
+}
+
+define void @global_system_atomic_fadd_noret_v2f16_offset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off offset:1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB22_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB22_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  ret void
+}
+
+define void @global_system_atomic_fadd_noret_v2f16_negoffset(ptr addrspace(1) %ptr, <2 x half> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[4:5], 0
+; GFX900-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX900-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX900-NEXT:    v_mov_b32_e32 v4, v3
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v5, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v5, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v4, v5, v2
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[4:5], off offset:-1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v5, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off offset:-1024
+; GFX10-NEXT:    s_mov_b32 s4, 0
+; GFX10-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v4, v3
+; GFX10-NEXT:    s_or_b32 s4, vcc_lo, s4
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v4, v[0:1], off offset:-1024
+; GFX11-NEXT:    s_mov_b32 s0, 0
+; GFX11-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-NEXT:    v_mov_b32_e32 v4, v3
+; GFX11-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2f16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v4, v[0:1], off offset:-1024
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB23_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v4, v2
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB23_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x half>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x half> %val seq_cst
+  ret void
+}
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB24_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB24_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB25_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB25_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_agent_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_ret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB26_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB26_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret <2 x bfloat> %result
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB27_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB27_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB28_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB28_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define void @global_agent_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_agent_atomic_fadd_noret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB29_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_DEV
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB29_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB30_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB30_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst
+  ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB31_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB31_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @global_system_atomic_fadd_ret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_mov_b32_e32 v6, v3
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX900-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX900-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX900-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX900-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX900-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX900-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v0, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v3
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX908-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX908-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v7, v7, v3, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e64 v3, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v5, v5, v3, s9
+; GFX908-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v6
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v7, v3
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX90A-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX90A-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v3, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v6, v5, v3, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v7, v3
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v7
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v7
+; GFX940-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX940-NEXT:    v_bfe_u32 v6, v3, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v3
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v3, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v3, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v3, v6, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v6, v5, v3, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v3, v[0:1], v[6:7], off offset:-1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v7
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_mov_b32_e32 v6, v3
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX10-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX10-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX10-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX10-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    v_mov_b32_e32 v0, v3
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_mov_b32_e32 v6, v3
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX11-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX11-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX11-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX11-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX11-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX11-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX11-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    v_mov_b32_e32 v0, v3
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_ret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v2, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB32_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v6, v3
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v6
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v6
+; GFX12-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX12-NEXT:    v_add_f32_e32 v3, v3, v4
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v8, v8, v5, 0x7fff
+; GFX12-NEXT:    v_bfe_u32 v7, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v3, v3
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v8, v10, vcc_lo
+; GFX12-NEXT:    v_add3_u32 v7, v7, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e64 v3, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v5, v5, v3, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v3, v[0:1], v[5:6], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB32_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  ret <2 x bfloat> %result
+}
+
+define void @global_system_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB33_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB33_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val seq_cst
+  ret void
+}
+
+define void @global_system_atomic_fadd_noret_v2bf16_offset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX900-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX900-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX900-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX908-LABEL: global_atomic_fadd_noret_v2bf16:
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
 ; GFX908:       ; %bb.0:
 ; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX908-NEXT:    global_load_dword v3, v[0:1], off
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX908-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -2438,28 +7576,28 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
 ; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
-; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    buffer_wbinvl1_vol
 ; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX908-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX908-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX90A-LABEL: global_atomic_fadd_noret_v2bf16:
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
 ; GFX90A:       ; %bb.0:
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX90A-NEXT:    global_load_dword v3, v[0:1], off
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
 ; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -2477,26 +7615,69 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
 ; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
 ; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
-; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
 ; GFX90A-NEXT:    buffer_wbinvl1_vol
 ; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
 ; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
 ; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; GFX90A-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX10-LABEL: global_atomic_fadd_noret_v2bf16:
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    global_load_dword v3, v[0:1], off
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:1024
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX10-NEXT:    s_mov_b32 s5, 0
-; GFX10-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX10-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -2515,7 +7696,7 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
 ; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    buffer_gl1_inv
 ; GFX10-NEXT:    buffer_gl0_inv
@@ -2523,21 +7704,21 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX10-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
 ; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
-; GFX10-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX10-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX11-LABEL: global_atomic_fadd_noret_v2bf16:
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    global_load_b32 v3, v[0:1], off
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
 ; GFX11-NEXT:    s_mov_b32 s1, 0
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
 ; GFX11-NEXT:    .p2align 6
-; GFX11-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX11-NEXT:  .LBB34_1: ; %atomicrmw.start
 ; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
@@ -2556,7 +7737,7 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
 ; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
 ; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off glc
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 glc
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_gl1_inv
 ; GFX11-NEXT:    buffer_gl0_inv
@@ -2564,12 +7745,343 @@ define void @global_atomic_fadd_noret_v2bf16(ptr addrspace(1) %ptr, <2 x bfloat>
 ; GFX11-NEXT:    v_mov_b32_e32 v3, v2
 ; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
 ; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
-; GFX11-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX11-NEXT:    s_cbranch_execnz .LBB34_1
 ; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
 ; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
 ; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
-  %result = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %val syncscope("agent") seq_cst
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16_offset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB34_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB34_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
+  ret void
+}
+
+define void @global_system_atomic_fadd_noret_v2bf16_negoffset(ptr addrspace(1) %ptr, <2 x bfloat> %val) {
+; GFX900-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX900-NEXT:    s_mov_b64 s[6:7], 0
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX900-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX900-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX900-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX900-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX900-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX900-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX900-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX900-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX900-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX900-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX900-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX900-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX900-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX900-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX900-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX900-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX900-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX900-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX900-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    buffer_wbinvl1_vol
+; GFX900-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX900-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX900-NEXT:    v_mov_b32_e32 v3, v2
+; GFX900-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX900-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX908-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX908-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX908-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX908-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX908-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX908-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX908-NEXT:    s_waitcnt vmcnt(0)
+; GFX908-NEXT:    buffer_wbinvl1_vol
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX90A-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX90A-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX90A-NEXT:    v_add3_u32 v7, v7, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v9, v9, v6, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v6, v2, s9
+; GFX90A-NEXT:    buffer_wbl2
+; GFX90A-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX90A-NEXT:    s_waitcnt vmcnt(0)
+; GFX90A-NEXT:    buffer_invl2
+; GFX90A-NEXT:    buffer_wbinvl1_vol
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX940-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX940-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v9, v6, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX940-NEXT:    v_add3_u32 v7, v7, v2, s4
+; GFX940-NEXT:    v_add3_u32 v9, v9, v6, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v6, v6
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v6, v9, v10, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v7, v8, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v6, v2, s5
+; GFX940-NEXT:    buffer_wbl2 sc0 sc1
+; GFX940-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 sc0 sc1
+; GFX940-NEXT:    s_waitcnt vmcnt(0)
+; GFX940-NEXT:    buffer_inv sc0 sc1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX10-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT:    global_load_dword v3, v[0:1], off offset:-1024
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX10-NEXT:    s_mov_b32 s5, 0
+; GFX10-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX10-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX10-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX10-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX10-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX10-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX10-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX10-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX10-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX10-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX10-NEXT:    v_cmp_u_f32_e64 s4, v2, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s4
+; GFX10-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX10-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX10-NEXT:    global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    buffer_gl1_inv
+; GFX10-NEXT:    buffer_gl0_inv
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, v2
+; GFX10-NEXT:    s_or_b32 s5, vcc_lo, s5
+; GFX10-NEXT:    s_andn2_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX10-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX10-NEXT:    s_or_b32 exec_lo, exec_lo, s5
+; GFX10-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX11-NEXT:    s_mov_b32 s1, 0
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x1
+; GFX11-NEXT:    .p2align 6
+; GFX11-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX11-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX11-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX11-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX11-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX11-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX11-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX11-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX11-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX11-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX11-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 glc
+; GFX11-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-NEXT:    buffer_gl1_inv
+; GFX11-NEXT:    buffer_gl0_inv
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX11-NEXT:    v_mov_b32_e32 v3, v2
+; GFX11-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX11-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX11-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX11-NEXT:    s_set_inst_prefetch_distance 0x2
+; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: global_system_atomic_fadd_noret_v2bf16_negoffset:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_load_b32 v3, v[0:1], off offset:-1024
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v2
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v2
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB35_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
+; GFX12-NEXT:    v_and_b32_e32 v6, 0xffff0000, v3
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v4
+; GFX12-NEXT:    v_add_f32_e32 v6, v6, v5
+; GFX12-NEXT:    v_bfe_u32 v7, v2, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v8, v6, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v2
+; GFX12-NEXT:    v_or_b32_e32 v10, 0x400000, v6
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v6, v6
+; GFX12-NEXT:    v_add3_u32 v7, v7, v2, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v8, v8, v6, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_cndmask_b32_e32 v6, v8, v10, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v7, v9, s0
+; GFX12-NEXT:    v_perm_b32 v2, v6, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:-1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT:    s_wait_loadcnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SYS
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB35_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+  %gep = getelementptr <2 x bfloat>, ptr addrspace(1) %ptr, i32 -256
+  %result = atomicrmw fadd ptr addrspace(1) %gep, <2 x bfloat> %val seq_cst
   ret void
 }
 
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index 4d8c6aa6e1d9ea..5e5f8d2ba834aa 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -886,12 +886,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GCN-NEXT:    v_writelane_b32 v40, s30, 30
 ; GCN-NEXT:    v_writelane_b32 v40, s31, 31
 ; GCN-NEXT:    s_mov_b64 s[6:7], exec
-; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GCN-NEXT:    v_readfirstlane_b32 s8, v0
 ; GCN-NEXT:    v_readfirstlane_b32 s9, v1
 ; GCN-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GCN-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GCN-NEXT:    s_movk_i32 s4, 0x7b
 ; GCN-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_xor_b64 exec, exec, s[10:11]
@@ -980,12 +980,12 @@ define void @test_indirect_call_vgpr_ptr_inreg_arg(ptr %fptr) {
 ; GISEL-NEXT:    v_writelane_b32 v40, s30, 30
 ; GISEL-NEXT:    v_writelane_b32 v40, s31, 31
 ; GISEL-NEXT:    s_mov_b64 s[6:7], exec
-; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:  .LBB6_1: ; =>This Inner Loop Header: Depth=1
 ; GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, s[8:9], v[0:1]
 ; GISEL-NEXT:    s_and_saveexec_b64 s[10:11], vcc
+; GISEL-NEXT:    s_movk_i32 s4, 0x7b
 ; GISEL-NEXT:    s_swappc_b64 s[30:31], s[8:9]
 ; GISEL-NEXT:    ; implicit-def: $vgpr0
 ; GISEL-NEXT:    s_xor_b64 exec, exec, s[10:11]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
new file mode 100644
index 00000000000000..de484e3db18ab5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.load-last-use.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;RUN: llc < %s -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN
+;RUN: llc < %s -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-delay-alu=0 | FileCheck %s --check-prefix=GCN
+
+define float @raw_buffer_load(<4 x i32> inreg) {
+; GCN-LABEL: raw_buffer_load:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    buffer_load_b32 v0, off, s[0:3], null th:TH_LOAD_LU
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %data = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 3)
+  ret float %data
+}
+
+define float @struct_buffer_load(<4 x i32> inreg) {
+; GCN-LABEL: struct_buffer_load:
+; GCN:       ; %bb.0: ; %main_body
+; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GCN-NEXT:    s_wait_expcnt 0x0
+; GCN-NEXT:    s_wait_samplecnt 0x0
+; GCN-NEXT:    s_wait_bvhcnt 0x0
+; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    v_mov_b32_e32 v0, 0
+; GCN-NEXT:    buffer_load_b32 v0, v0, s[0:3], null idxen th:TH_LOAD_LU
+; GCN-NEXT:    s_wait_loadcnt 0x0
+; GCN-NEXT:    s_setpc_b64 s[30:31]
+main_body:
+  %data = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %0, i32 0, i32 0, i32 0, i32 3)
+  ret float %data
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
index 89abdb2b754a44..4373b76070e32a 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX940 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX12 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8 %s
 
@@ -22,6 +25,20 @@ define float @lds_atomic_fadd_ret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: lds_atomic_fadd_ret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX7-LABEL: lds_atomic_fadd_ret_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -87,6 +104,20 @@ define void @lds_atomic_fadd_noret_f32(ptr addrspace(3) %ptr) nounwind {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: lds_atomic_fadd_noret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_add_f32 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX7-LABEL: lds_atomic_fadd_noret_f32:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -215,84 +246,329 @@ define amdgpu_kernel void @lds_ds_fadd(ptr addrspace(1) %out, ptr addrspace(3) %
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: lds_ds_fadd:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s3, s3, 4
-; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_lshl_b32 s8, s3, 3
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB2_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB2_4
-; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    ds_add_f32 v2, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB2_4:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v0
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    ; implicit-def: $vgpr0
-; GFX9-NEXT:  .LBB2_5: ; %ComputeLoop
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_ff1_i32_b64 s3, s[4:5]
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    v_readlane_b32 s9, v2, s3
-; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    v_add_f32_e32 v1, s9, v1
-; GFX9-NEXT:    s_cbranch_scc1 .LBB2_5
-; GFX9-NEXT:  ; %bb.6: ; %ComputeEnd
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    ; implicit-def: $vgpr2
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB2_8
-; GFX9-NEXT:  ; %bb.7:
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_add_rtn_f32 v2, v2, v1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:  .LBB2_8:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX908-LABEL: lds_ds_fadd:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_add_i32 s3, s3, 4
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB2_2
+; GFX908-NEXT:  ; %bb.1:
+; GFX908-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX908-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s8
+; GFX908-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:  .LBB2_2:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB2_4
+; GFX908-NEXT:  ; %bb.3:
+; GFX908-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX908-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s3
+; GFX908-NEXT:    ds_add_f32 v2, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:  .LBB2_4:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX908-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX908-NEXT:    ; implicit-def: $vgpr0
+; GFX908-NEXT:  .LBB2_5: ; %ComputeLoop
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX908-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX908-NEXT:    s_mov_b32 m0, s3
+; GFX908-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX908-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX908-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX908-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX908-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX908-NEXT:    ; implicit-def: $vgpr2
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB2_8
+; GFX908-NEXT:  ; %bb.7:
+; GFX908-NEXT:    v_mov_b32_e32 v2, s2
+; GFX908-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:  .LBB2_8:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: lds_ds_fadd:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_add_i32 s3, s3, 4
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB2_2
+; GFX90A-NEXT:  ; %bb.1:
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:  .LBB2_2:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB2_4
+; GFX90A-NEXT:  ; %bb.3:
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX90A-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90A-NEXT:    ds_add_f32 v2, v1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:  .LBB2_4:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX90A-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX90A-NEXT:    ; implicit-def: $vgpr0
+; GFX90A-NEXT:  .LBB2_5: ; %ComputeLoop
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT:    s_mov_b32 m0, s3
+; GFX90A-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX90A-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX90A-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX90A-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX90A-NEXT:    ; implicit-def: $vgpr2
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB2_8
+; GFX90A-NEXT:  ; %bb.7:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:  .LBB2_8:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: lds_ds_fadd:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_add_i32 s3, s3, 4
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB2_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:  .LBB2_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT:    s_mov_b64 s[6:7], exec
+; GFX940-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB2_4
+; GFX940-NEXT:  ; %bb.3:
+; GFX940-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX940-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NEXT:    ds_add_f32 v2, v1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:  .LBB2_4:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX940-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX940-NEXT:    ; implicit-def: $vgpr0
+; GFX940-NEXT:  .LBB2_5: ; %ComputeLoop
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX940-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX940-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX940-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX940-NEXT:    s_mov_b32 m0, s3
+; GFX940-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX940-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX940-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX940-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX940-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX940-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX940-NEXT:    ; implicit-def: $vgpr2
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execz .LBB2_8
+; GFX940-NEXT:  ; %bb.7:
+; GFX940-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:  .LBB2_8:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: lds_ds_fadd:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
+; GFX12-NEXT:    s_mov_b32 s5, exec_lo
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s3, s3, 4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB2_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s5
+; GFX12-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
+; GFX12-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:  .LBB2_2:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    s_mov_b32 s6, exec_lo
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX12-NEXT:    s_cbranch_execz .LBB2_4
+; GFX12-NEXT:  ; %bb.3:
+; GFX12-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1
+; GFX12-NEXT:    ds_add_f32 v2, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:  .LBB2_4:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    s_brev_b32 s3, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX12-NEXT:    v_add_f32_e32 v1, s5, v0
+; GFX12-NEXT:    ; implicit-def: $vgpr0
+; GFX12-NEXT:  .LBB2_5: ; %ComputeLoop
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX12-NEXT:    s_lshl_b32 s7, 1, s5
+; GFX12-NEXT:    v_writelane_b32 v0, s3, s5
+; GFX12-NEXT:    s_and_not1_b32 s4, s4, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_add_f32 s3, s3, s6
+; GFX12-NEXT:    s_cbranch_scc1 .LBB2_5
+; GFX12-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX12-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execz .LBB2_8
+; GFX12-NEXT:  ; %bb.7:
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT:    ds_add_rtn_f32 v1, v1, v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:  .LBB2_8:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: lds_ds_fadd:
 ; GFX7:       ; %bb.0:
@@ -566,82 +842,319 @@ define amdgpu_kernel void @lds_ds_fadd_one_as(ptr addrspace(1) %out, ptr addrspa
 ; VI-NEXT:    flat_store_dword v[0:1], v2
 ; VI-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: lds_ds_fadd_one_as:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
-; GFX9-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    s_add_i32 s3, s3, 4
-; GFX9-NEXT:    ; implicit-def: $vgpr1
-; GFX9-NEXT:    s_and_saveexec_b64 s[6:7], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB3_2
-; GFX9-NEXT:  ; %bb.1:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
-; GFX9-NEXT:    s_lshl_b32 s8, s3, 3
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s8
-; GFX9-NEXT:    ds_add_rtn_f32 v1, v2, v1
-; GFX9-NEXT:  .LBB3_2:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_mov_b64 s[6:7], exec
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_cbranch_execz .LBB3_4
-; GFX9-NEXT:  ; %bb.3:
-; GFX9-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 4
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
-; GFX9-NEXT:    v_mov_b32_e32 v2, s3
-; GFX9-NEXT:    ds_add_f32 v2, v1
-; GFX9-NEXT:  .LBB3_4:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], exec
-; GFX9-NEXT:    v_add_f32_e32 v2, s8, v0
-; GFX9-NEXT:    v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT:    ; implicit-def: $vgpr0
-; GFX9-NEXT:  .LBB3_5: ; %ComputeLoop
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_ff1_i32_b64 s3, s[4:5]
-; GFX9-NEXT:    s_lshl_b64 s[6:7], 1, s3
-; GFX9-NEXT:    v_readfirstlane_b32 s8, v1
-; GFX9-NEXT:    v_readlane_b32 s9, v2, s3
-; GFX9-NEXT:    s_mov_b32 m0, s3
-; GFX9-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
-; GFX9-NEXT:    v_writelane_b32 v0, s8, m0
-; GFX9-NEXT:    s_cmp_lg_u64 s[4:5], 0
-; GFX9-NEXT:    v_add_f32_e32 v1, s9, v1
-; GFX9-NEXT:    s_cbranch_scc1 .LBB3_5
-; GFX9-NEXT:  ; %bb.6: ; %ComputeEnd
-; GFX9-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
-; GFX9-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
-; GFX9-NEXT:    ; implicit-def: $vgpr2
-; GFX9-NEXT:    s_and_saveexec_b64 s[4:5], vcc
-; GFX9-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execz .LBB3_8
-; GFX9-NEXT:  ; %bb.7:
-; GFX9-NEXT:    v_mov_b32_e32 v2, s2
-; GFX9-NEXT:    ds_add_rtn_f32 v2, v2, v1
-; GFX9-NEXT:  .LBB3_8:
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_add_f32_e32 v0, s2, v0
-; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
-; GFX9-NEXT:    s_endpgm
+; GFX908-LABEL: lds_ds_fadd_one_as:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    s_add_i32 s3, s3, 4
+; GFX908-NEXT:    ; implicit-def: $vgpr1
+; GFX908-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB3_2
+; GFX908-NEXT:  ; %bb.1:
+; GFX908-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX908-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s8
+; GFX908-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX908-NEXT:  .LBB3_2:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_mov_b64 s[6:7], exec
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_cbranch_execz .LBB3_4
+; GFX908-NEXT:  ; %bb.3:
+; GFX908-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX908-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX908-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX908-NEXT:    v_mov_b32_e32 v2, s3
+; GFX908-NEXT:    ds_add_f32 v2, v1
+; GFX908-NEXT:  .LBB3_4:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX908-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], exec
+; GFX908-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX908-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX908-NEXT:    ; implicit-def: $vgpr0
+; GFX908-NEXT:  .LBB3_5: ; %ComputeLoop
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX908-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX908-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX908-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX908-NEXT:    s_mov_b32 m0, s3
+; GFX908-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX908-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX908-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX908-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX908-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX908-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX908-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX908-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX908-NEXT:    ; implicit-def: $vgpr2
+; GFX908-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX908-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execz .LBB3_8
+; GFX908-NEXT:  ; %bb.7:
+; GFX908-NEXT:    v_mov_b32_e32 v2, s2
+; GFX908-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX908-NEXT:  .LBB3_8:
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX908-NEXT:    v_mov_b32_e32 v1, 0
+; GFX908-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX908-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: lds_ds_fadd_one_as:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_add_i32 s3, s3, 4
+; GFX90A-NEXT:    ; implicit-def: $vgpr1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB3_2
+; GFX90A-NEXT:  ; %bb.1:
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX90A-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s8
+; GFX90A-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX90A-NEXT:  .LBB3_2:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_mov_b64 s[6:7], exec
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_cbranch_execz .LBB3_4
+; GFX90A-NEXT:  ; %bb.3:
+; GFX90A-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX90A-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX90A-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s3
+; GFX90A-NEXT:    ds_add_f32 v2, v1
+; GFX90A-NEXT:  .LBB3_4:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX90A-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], exec
+; GFX90A-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX90A-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX90A-NEXT:    ; implicit-def: $vgpr0
+; GFX90A-NEXT:  .LBB3_5: ; %ComputeLoop
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX90A-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX90A-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX90A-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX90A-NEXT:    s_mov_b32 m0, s3
+; GFX90A-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX90A-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX90A-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX90A-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX90A-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX90A-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX90A-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX90A-NEXT:    ; implicit-def: $vgpr2
+; GFX90A-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execz .LBB3_8
+; GFX90A-NEXT:  ; %bb.7:
+; GFX90A-NEXT:    v_mov_b32_e32 v2, s2
+; GFX90A-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX90A-NEXT:  .LBB3_8:
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 0
+; GFX90A-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX90A-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX940-LABEL: lds_ds_fadd_one_as:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x2c
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v0, s4, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v0, s5, v0
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_add_i32 s3, s3, 4
+; GFX940-NEXT:    ; implicit-def: $vgpr1
+; GFX940-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB3_2
+; GFX940-NEXT:  ; %bb.1:
+; GFX940-NEXT:    s_bcnt1_i32_b64 s4, s[4:5]
+; GFX940-NEXT:    s_lshl_b32 s8, s3, 3
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s4
+; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s8
+; GFX940-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX940-NEXT:  .LBB3_2:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX940-NEXT:    s_mov_b64 s[6:7], exec
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v1, s6, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v1, s7, v1
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_cbranch_execz .LBB3_4
+; GFX940-NEXT:  ; %bb.3:
+; GFX940-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX940-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX940-NEXT:    v_mul_f32_e32 v1, 0x42280000, v1
+; GFX940-NEXT:    v_mov_b32_e32 v2, s3
+; GFX940-NEXT:    ds_add_f32 v2, v1
+; GFX940-NEXT:  .LBB3_4:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX940-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX940-NEXT:    s_mov_b64 s[4:5], exec
+; GFX940-NEXT:    v_add_f32_e32 v2, s8, v0
+; GFX940-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX940-NEXT:    ; implicit-def: $vgpr0
+; GFX940-NEXT:  .LBB3_5: ; %ComputeLoop
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_ff1_i32_b64 s3, s[4:5]
+; GFX940-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX940-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX940-NEXT:    v_readlane_b32 s9, v2, s3
+; GFX940-NEXT:    s_mov_b32 m0, s3
+; GFX940-NEXT:    s_andn2_b64 s[4:5], s[4:5], s[6:7]
+; GFX940-NEXT:    v_writelane_b32 v0, s8, m0
+; GFX940-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX940-NEXT:    v_add_f32_e32 v1, s9, v1
+; GFX940-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX940-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX940-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX940-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX940-NEXT:    ; implicit-def: $vgpr2
+; GFX940-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX940-NEXT:    s_cbranch_execz .LBB3_8
+; GFX940-NEXT:  ; %bb.7:
+; GFX940-NEXT:    v_mov_b32_e32 v2, s2
+; GFX940-NEXT:    ds_add_rtn_f32 v2, v2, v1
+; GFX940-NEXT:  .LBB3_8:
+; GFX940-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX940-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX940-NEXT:    v_mov_b32_e32 v1, 0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX940-NEXT:    global_store_dword v1, v0, s[0:1] sc0 sc1
+; GFX940-NEXT:    s_endpgm
+;
+; GFX12-LABEL: lds_ds_fadd_one_as:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_load_b64 s[2:3], s[0:1], 0x2c
+; GFX12-NEXT:    s_mov_b32 s5, exec_lo
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v0, s5, 0
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    s_add_co_i32 s3, s3, 4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v0
+; GFX12-NEXT:    s_cbranch_execz .LBB3_2
+; GFX12-NEXT:  ; %bb.1:
+; GFX12-NEXT:    s_bcnt1_i32_b32 s5, s5
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s5
+; GFX12-NEXT:    s_lshl_b32 s5, s3, 3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s5 :: v_dual_mul_f32 v1, 0x42280000, v1
+; GFX12-NEXT:    ds_add_rtn_f32 v1, v2, v1
+; GFX12-NEXT:  .LBB3_2:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    s_mov_b32 s6, exec_lo
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v2, s6, 0
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    v_cmpx_eq_u32_e32 0, v2
+; GFX12-NEXT:    s_cbranch_execz .LBB3_4
+; GFX12-NEXT:  ; %bb.3:
+; GFX12-NEXT:    s_bcnt1_i32_b32 s6, s6
+; GFX12-NEXT:    s_lshl_b32 s3, s3, 4
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v1, s6
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v2, s3 :: v_dual_mul_f32 v1, 0x42280000, v1
+; GFX12-NEXT:    ds_add_f32 v2, v1
+; GFX12-NEXT:  .LBB3_4:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    v_cvt_f32_ubyte0_e32 v0, v0
+; GFX12-NEXT:    s_mov_b32 s4, exec_lo
+; GFX12-NEXT:    s_brev_b32 s3, 1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_mul_f32_e32 v0, 0x42280000, v0
+; GFX12-NEXT:    v_add_f32_e32 v1, s5, v0
+; GFX12-NEXT:    ; implicit-def: $vgpr0
+; GFX12-NEXT:  .LBB3_5: ; %ComputeLoop
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_ctz_i32_b32 s5, s4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_readlane_b32 s6, v1, s5
+; GFX12-NEXT:    s_lshl_b32 s7, 1, s5
+; GFX12-NEXT:    v_writelane_b32 v0, s3, s5
+; GFX12-NEXT:    s_and_not1_b32 s4, s4, s7
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX12-NEXT:    s_add_f32 s3, s3, s6
+; GFX12-NEXT:    s_cbranch_scc1 .LBB3_5
+; GFX12-NEXT:  ; %bb.6: ; %ComputeEnd
+; GFX12-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX12-NEXT:    ; implicit-def: $vgpr1
+; GFX12-NEXT:    s_and_saveexec_b32 s4, vcc_lo
+; GFX12-NEXT:    s_xor_b32 s4, exec_lo, s4
+; GFX12-NEXT:    s_cbranch_execz .LBB3_8
+; GFX12-NEXT:  ; %bb.7:
+; GFX12-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX12-NEXT:    ds_add_rtn_f32 v1, v1, v2
+; GFX12-NEXT:  .LBB3_8:
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s4
+; GFX12-NEXT:    s_load_b64 s[0:1], s[0:1], 0x24
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_add_f32 v0, s2, v0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
+; GFX12-NEXT:    s_nop 0
+; GFX12-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT:    s_endpgm
 ;
 ; GFX7-LABEL: lds_ds_fadd_one_as:
 ; GFX7:       ; %bb.0:
@@ -858,27 +1371,73 @@ define double @lds_atomic_fadd_ret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_ret_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v2, v0
-; GFX9-NEXT:    ds_read_b64 v[0:1], v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB4_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, v0
-; GFX9-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
-; GFX9-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB4_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_ret_f64:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v2, v0
+; GFX908-NEXT:    ds_read_b64 v[0:1], v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v1
+; GFX908-NEXT:    v_mov_b32_e32 v3, v0
+; GFX908-NEXT:    v_add_f64 v[0:1], v[3:4], 4.0
+; GFX908-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[3:4], v[0:1]
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[3:4]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_ret_f64:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_ret_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    ds_add_rtn_f64 v[0:1], v0, v[2:3]
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_ret_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v2, v0
+; GFX12-NEXT:    ds_load_b64 v[0:1], v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB4_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v3, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_f64_e32 v[0:1], 4.0, v[3:4]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b64 v[0:1], v2, v[0:1], v[3:4]
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[3:4]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB4_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_f64:
 ; GFX7:       ; %bb.0:
@@ -952,26 +1511,70 @@ define void @lds_atomic_fadd_noret_f64(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_noret_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b64 v[1:2], v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB5_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
-; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
-; GFX9-NEXT:    v_mov_b32_e32 v1, v3
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v4
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB5_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_noret_f64:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b64 v[1:2], v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_add_f64 v[3:4], v[1:2], 4.0
+; GFX908-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[1:2], v[3:4]
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[1:2]
+; GFX908-NEXT:    v_mov_b32_e32 v1, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v4
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_noret_f64:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v2, 0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT:    ds_add_f64 v0, v[2:3]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_noret_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[2:3], 4.0
+; GFX940-NEXT:    ds_add_f64 v0, v[2:3]
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_noret_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b64 v[1:2], v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB5_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_add_f64_e32 v[3:4], 4.0, v[1:2]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[1:2]
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[1:2]
+; GFX12-NEXT:    v_dual_mov_b32 v1, v3 :: v_dual_mov_b32 v2, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB5_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_f64:
 ; GFX7:       ; %bb.0:
@@ -1043,26 +1646,97 @@ define float @lds_atomic_fsub_ret_f32(ptr addrspace(3) %ptr, float %val) nounwin
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fsub_ret_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v2, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB6_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-NEXT:    v_sub_f32_e32 v2, v3, v1
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB6_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fsub_ret_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v2, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fsub_ret_f32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v2, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fsub_ret_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fsub_ret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v2, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB6_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_sub_f32_e32 v2, v3, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB6_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_ret_f32:
 ; GFX7:       ; %bb.0:
@@ -1133,25 +1807,92 @@ define void @lds_atomic_fsub_noret_f32(ptr addrspace(3) %ptr, float %val) nounwi
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fsub_noret_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v2, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB7_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_sub_f32_e32 v3, v2, v1
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB7_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fsub_noret_f32:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v2, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fsub_noret_f32:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v2, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fsub_noret_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fsub_noret_f32:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v2, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB7_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_sub_f32_e32 v3, v2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB7_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_noret_f32:
 ; GFX7:       ; %bb.0:
@@ -1223,28 +1964,103 @@ define double @lds_atomic_fsub_ret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; VI-NEXT:    v_mov_b32_e32 v1, v4
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fsub_ret_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b64 v[3:4], v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB8_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v6, v4
-; GFX9-NEXT:    v_mov_b32_e32 v5, v3
-; GFX9-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
-; GFX9-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB8_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v3
-; GFX9-NEXT:    v_mov_b32_e32 v1, v4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fsub_ret_f64:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b64 v[3:4], v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v6, v4
+; GFX908-NEXT:    v_mov_b32_e32 v5, v3
+; GFX908-NEXT:    v_add_f64 v[3:4], v[5:6], -v[1:2]
+; GFX908-NEXT:    ds_cmpst_rtn_b64 v[3:4], v0, v[5:6], v[3:4]
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[3:4], v[5:6]
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v3
+; GFX908-NEXT:    v_mov_b32_e32 v1, v4
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fsub_ret_f64:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v5, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v0
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v1
+; GFX90A-NEXT:    ds_read_b64 v[0:1], v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT:    v_add_f64 v[0:1], v[6:7], -v[4:5]
+; GFX90A-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[6:7], v[0:1]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fsub_ret_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v5, v2
+; GFX940-NEXT:    v_mov_b32_e32 v2, v0
+; GFX940-NEXT:    v_mov_b32_e32 v4, v1
+; GFX940-NEXT:    ds_read_b64 v[0:1], v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX940-NEXT:    v_add_f64 v[0:1], v[6:7], -v[4:5]
+; GFX940-NEXT:    ds_cmpst_rtn_b64 v[0:1], v2, v[6:7], v[0:1]
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fsub_ret_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b64 v[3:4], v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB8_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_dual_mov_b32 v6, v4 :: v_dual_mov_b32 v5, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_add_f64_e64 v[3:4], v[5:6], -v[1:2]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b64 v[3:4], v0, v[3:4], v[5:6]
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[3:4], v[5:6]
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB8_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_ret_f64:
 ; GFX7:       ; %bb.0:
@@ -1320,26 +2136,97 @@ define void @lds_atomic_fsub_noret_f64(ptr addrspace(3) %ptr, double %val) nounw
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fsub_noret_f64:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b64 v[3:4], v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB9_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
-; GFX9-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v5
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v4, v6
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB9_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fsub_noret_f64:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b64 v[3:4], v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_add_f64 v[5:6], v[3:4], -v[1:2]
+; GFX908-NEXT:    ds_cmpst_rtn_b64 v[5:6], v0, v[3:4], v[5:6]
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v5
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v4, v6
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fsub_noret_f64:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b64 v[4:5], v0
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v1
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_add_f64 v[6:7], v[4:5], -v[2:3]
+; GFX90A-NEXT:    ds_cmpst_rtn_b64 v[6:7], v0, v[4:5], v[6:7]
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_pk_mov_b32 v[4:5], v[6:7], v[6:7] op_sel:[0,1]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fsub_noret_f64:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b64 v[4:5], v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    v_mov_b32_e32 v2, v1
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_add_f64 v[6:7], v[4:5], -v[2:3]
+; GFX940-NEXT:    ds_cmpst_rtn_b64 v[6:7], v0, v[4:5], v[6:7]
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fsub_noret_f64:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b64 v[3:4], v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB9_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_add_f64_e64 v[5:6], v[3:4], -v[1:2]
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b64 v[5:6], v0, v[5:6], v[3:4]
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u64_e32 vcc_lo, v[5:6], v[3:4]
+; GFX12-NEXT:    v_dual_mov_b32 v3, v5 :: v_dual_mov_b32 v4, v6
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB9_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fsub_noret_f64:
 ; GFX7:       ; %bb.0:
@@ -1427,41 +2314,164 @@ define bfloat @lds_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_ret_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, -4, v0
-; GFX9-NEXT:    ds_read_b32 v3, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
-; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:  .LBB10_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, v3
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_f32_e32 v3, 4.0, v3
-; GFX9-NEXT:    v_bfe_u32 v5, v3, 16, 1
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v3
-; GFX9-NEXT:    v_add3_u32 v5, v5, v3, s6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_or_b32 v3, v4, v2, v3
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB10_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX908-NEXT:    ds_read_b32 v3, v1
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX908-NEXT:    s_mov_b32 s4, 0xffff
+; GFX908-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX908-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX908-NEXT:    v_not_b32_e32 v2, v2
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX908-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v3
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX908-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v3
+; GFX908-NEXT:    v_add3_u32 v5, v5, v3, s6
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX908-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX90A-NEXT:    ds_read_b32 v3, v1
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
+; GFX90A-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX90A-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX90A-NEXT:    v_not_b32_e32 v2, v2
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v3
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX90A-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v3
+; GFX90A-NEXT:    v_add3_u32 v5, v5, v3, s6
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX90A-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX940-NEXT:    ds_read_b32 v3, v1
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX940-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX940-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX940-NEXT:    v_not_b32_e32 v2, v2
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX940-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v3
+; GFX940-NEXT:    v_lshrrev_b32_sdwa v3, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX940-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v3
+; GFX940-NEXT:    v_add3_u32 v5, v5, v3, s2
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v3, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc
+; GFX940-NEXT:    v_lshlrev_b32_sdwa v3, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v3, v1, v4, v3
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_ret_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    ds_load_b32 v3, v1
+; GFX12-NEXT:    v_lshlrev_b32_e64 v2, v0, 0xffff
+; GFX12-NEXT:    v_and_b32_e32 v0, 24, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_not_b32_e32 v2, v2
+; GFX12-NEXT:  .LBB10_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, v0, v4
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_f32_e32 v3, 4.0, v3
+; GFX12-NEXT:    v_bfe_u32 v5, v3, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v3
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v3, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v5, v5, v3, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e32 v3, v5, v6, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, v0, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_and_or_b32 v3, v4, v2, v3
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v3, v1, v3, v4
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB10_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_lshrrev_b32_e32 v0, v0, v3
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_bf16:
 ; GFX7:       ; %bb.0:
@@ -1572,40 +2582,160 @@ define void @lds_atomic_fadd_noret_bf16(ptr addrspace(3) %ptr) nounwind {
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_noret_bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v1, -4, v0
-; GFX9-NEXT:    ds_read_b32 v3, v1
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0xffff
-; GFX9-NEXT:    v_and_b32_e32 v0, 24, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
-; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:    s_movk_i32 s6, 0x7fff
-; GFX9-NEXT:  .LBB11_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_f32_e32 v4, 4.0, v4
-; GFX9-NEXT:    v_bfe_u32 v5, v4, 16, 1
-; GFX9-NEXT:    v_or_b32_e32 v6, 0x400000, v4
-; GFX9-NEXT:    v_add3_u32 v5, v5, v4, s6
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_and_or_b32 v4, v3, v2, v4
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB11_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX908-NEXT:    ds_read_b32 v3, v1
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX908-NEXT:    s_mov_b32 s4, 0xffff
+; GFX908-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX908-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX908-NEXT:    v_not_b32_e32 v2, v2
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX908-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX908-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX908-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX908-NEXT:    v_add3_u32 v5, v5, v4, s6
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX908-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX908-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX908-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v4
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX90A-NEXT:    ds_read_b32 v3, v1
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX90A-NEXT:    s_mov_b32 s4, 0xffff
+; GFX90A-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX90A-NEXT:    v_lshlrev_b32_e64 v2, v2, s4
+; GFX90A-NEXT:    v_not_b32_e32 v2, v2
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX90A-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX90A-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX90A-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX90A-NEXT:    v_add3_u32 v5, v5, v4, s6
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX90A-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX90A-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX90A-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX940-NEXT:    ds_read_b32 v3, v1
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX940-NEXT:    s_mov_b32 s0, 0xffff
+; GFX940-NEXT:    v_and_b32_e32 v0, 24, v2
+; GFX940-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
+; GFX940-NEXT:    v_not_b32_e32 v2, v2
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:    s_movk_i32 s2, 0x7fff
+; GFX940-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_lshrrev_b32_sdwa v4, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX940-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX940-NEXT:    v_add3_u32 v5, v5, v4, s2
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v4, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc
+; GFX940-NEXT:    v_lshlrev_b32_sdwa v4, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX940-NEXT:    v_and_or_b32 v4, v3, v2, v4
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v4, v1, v3, v4
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v4
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_noret_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v1, -4, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:    ds_load_b32 v2, v1
+; GFX12-NEXT:    v_lshlrev_b32_e64 v3, v0, 0xffff
+; GFX12-NEXT:    v_and_b32_e32 v0, 24, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_not_b32_e32 v3, v3
+; GFX12-NEXT:  .LBB11_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, v0, v2
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add_f32_e32 v4, 4.0, v4
+; GFX12-NEXT:    v_bfe_u32 v5, v4, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v6, 0x400000, v4
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v4, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v5, v5, v4, 0x7fff
+; GFX12-NEXT:    v_cndmask_b32_e32 v4, v5, v6, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
+; GFX12-NEXT:    v_lshlrev_b32_e32 v4, v0, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_and_or_b32 v4, v2, v3, v4
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v4, v1, v4, v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v2
+; GFX12-NEXT:    v_mov_b32_e32 v2, v4
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB11_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_bf16:
 ; GFX7:       ; %bb.0:
@@ -1692,6 +2822,20 @@ define float @lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode(ptr addrspace
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_add_rtn_f32 v0, v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX7-LABEL: lds_atomic_fadd_ret_f32__amdgpu_ignore_denormal_mode:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1757,6 +2901,20 @@ define void @lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode(ptr addrspac
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX12-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v1, 4.0
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_add_f32 v0, v1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX7-LABEL: lds_atomic_fadd_noret_f32__amdgpu_ignore_denormal_mode:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1827,26 +2985,97 @@ define <2 x half> @lds_atomic_fadd_ret_v2f16(ptr addrspace(3) %ptr, <2 x half> %
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_ret_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v2, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB14_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v3, v2
-; GFX9-NEXT:    v_pk_add_f16 v2, v3, v1
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB14_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_ret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v2, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v3, v2
+; GFX908-NEXT:    v_pk_add_f16 v2, v3, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_ret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v2, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v2
+; GFX90A-NEXT:    v_pk_add_f16 v2, v3, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_ret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v3, v2
+; GFX940-NEXT:    v_pk_add_f16 v2, v3, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v2, v0, v3, v2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_ret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v2, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v3, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_pk_add_f16 v2, v3, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v3
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_v2f16:
 ; GFX7:       ; %bb.0:
@@ -1959,25 +3188,92 @@ define void @lds_atomic_fadd_noret_v2f16(ptr addrspace(3) %ptr, <2 x half> %val)
 ; VI-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_noret_v2f16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v2, v0
-; GFX9-NEXT:    s_mov_b64 s[4:5], 0
-; GFX9-NEXT:  .LBB15_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_pk_add_f16 v3, v2, v1
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
-; GFX9-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX9-NEXT:    v_mov_b32_e32 v2, v3
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_cbranch_execnz .LBB15_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[4:5]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_noret_v2f16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v2, v0
+; GFX908-NEXT:    s_mov_b64 s[4:5], 0
+; GFX908-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_pk_add_f16 v3, v2, v1
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX908-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX908-NEXT:    v_mov_b32_e32 v2, v3
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_noret_v2f16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v2, v0
+; GFX90A-NEXT:    s_mov_b64 s[4:5], 0
+; GFX90A-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_pk_add_f16 v3, v2, v1
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX90A-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT:    v_mov_b32_e32 v2, v3
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_noret_v2f16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0
+; GFX940-NEXT:    s_mov_b64 s[0:1], 0
+; GFX940-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_pk_add_f16 v3, v2, v1
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v3, v0, v2, v3
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v2
+; GFX940-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX940-NEXT:    v_mov_b32_e32 v2, v3
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_noret_v2f16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v2, v0
+; GFX12-NEXT:    s_mov_b32 s0, 0
+; GFX12-NEXT:  .LBB15_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_pk_add_f16 v3, v2, v1
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v3, v0, v3, v2
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v3, v2
+; GFX12-NEXT:    v_mov_b32_e32 v2, v3
+; GFX12-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_cbranch_execnz .LBB15_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_v2f16:
 ; GFX7:       ; %bb.0:
@@ -2106,44 +3402,171 @@ define <2 x bfloat> @lds_atomic_fadd_ret_v2bf16(ptr addrspace(3) %ptr, <2 x bflo
 ; VI-NEXT:    v_mov_b32_e32 v0, v2
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_ret_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v2, v0
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX9-NEXT:  .LBB16_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mov_b32_e32 v4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
-; GFX9-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX9-NEXT:    v_bfe_u32 v6, v2, 16, 1
-; GFX9-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v2
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX9-NEXT:    v_add3_u32 v6, v6, v2, s8
-; GFX9-NEXT:    v_add3_u32 v8, v8, v5, s8
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v2, v5, v2, s9
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v2, v0, v4, v2
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_cbranch_execnz .LBB16_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v0, v2
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_ret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v2, v0
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_mov_b32_e32 v4, v2
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX908-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v2, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX908-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v2, v5, v2, s9
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v0, v2
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_ret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v2, v0
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_mov_b32_e32 v4, v2
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX90A-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v2, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v2, v2
+; GFX90A-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v2, v5, v2, s9
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v0, v2
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_ret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v2, v0
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_mov_b32_e32 v4, v2
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v4
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX940-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX940-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v2
+; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v2, s4
+; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v2, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v2, v6, v7, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v2, v5, v2, s5
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v2, v0, v4, v2
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v0, v2
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_ret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v2, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB16_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_mov_b32_e32 v4, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v4
+; GFX12-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_cndmask_b32 v5, v7, v9 :: v_dual_lshlrev_b32 v2, 16, v4
+; GFX12-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX12-NEXT:    v_bfe_u32 v6, v2, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v2
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v2, v2
+; GFX12-NEXT:    v_add3_u32 v6, v6, v2, 0x7fff
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v2, v6, v8, s0
+; GFX12-NEXT:    v_perm_b32 v2, v5, v2, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v2, v0, v2, v4
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v2, v4
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB16_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    v_mov_b32_e32 v0, v2
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_ret_v2bf16:
 ; GFX7:       ; %bb.0:
@@ -2267,43 +3690,166 @@ define void @lds_atomic_fadd_noret_v2bf16(ptr addrspace(3) %ptr, <2 x bfloat> %v
 ; VI-NEXT:    s_or_b64 exec, exec, s[6:7]
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: lds_atomic_fadd_noret_v2bf16:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    ds_read_b32 v3, v0
-; GFX9-NEXT:    s_mov_b64 s[6:7], 0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0x7fff
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
-; GFX9-NEXT:    s_mov_b32 s9, 0x7060302
-; GFX9-NEXT:  .LBB17_1: ; %atomicrmw.start
-; GFX9-NEXT:    ; =>This Inner Loop Header: Depth=1
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
-; GFX9-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
-; GFX9-NEXT:    v_add_f32_e32 v4, v4, v2
-; GFX9-NEXT:    v_add_f32_e32 v5, v5, v1
-; GFX9-NEXT:    v_bfe_u32 v6, v4, 16, 1
-; GFX9-NEXT:    v_bfe_u32 v8, v5, 16, 1
-; GFX9-NEXT:    v_or_b32_e32 v7, 0x400000, v4
-; GFX9-NEXT:    v_or_b32_e32 v9, 0x400000, v5
-; GFX9-NEXT:    v_add3_u32 v6, v6, v4, s8
-; GFX9-NEXT:    v_add3_u32 v8, v8, v5, s8
-; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
-; GFX9-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s9
-; GFX9-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
-; GFX9-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX9-NEXT:    v_mov_b32_e32 v3, v4
-; GFX9-NEXT:    s_andn2_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_cbranch_execnz .LBB17_1
-; GFX9-NEXT:  ; %bb.2: ; %atomicrmw.end
-; GFX9-NEXT:    s_or_b64 exec, exec, s[6:7]
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX908-LABEL: lds_atomic_fadd_noret_v2bf16:
+; GFX908:       ; %bb.0:
+; GFX908-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX908-NEXT:    ds_read_b32 v3, v0
+; GFX908-NEXT:    s_mov_b64 s[6:7], 0
+; GFX908-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX908-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX908-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX908-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX908-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX908-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX908-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX908-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX908-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX908-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX908-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX908-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX908-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX908-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX908-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX908-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX908-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX908-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX908-NEXT:    v_perm_b32 v4, v5, v4, s9
+; GFX908-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX908-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX908-NEXT:    v_mov_b32_e32 v3, v4
+; GFX908-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX908-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX908-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX90A-LABEL: lds_atomic_fadd_noret_v2bf16:
+; GFX90A:       ; %bb.0:
+; GFX90A-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT:    ds_read_b32 v3, v0
+; GFX90A-NEXT:    s_mov_b64 s[6:7], 0
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX90A-NEXT:    s_movk_i32 s8, 0x7fff
+; GFX90A-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX90A-NEXT:    s_mov_b32 s9, 0x7060302
+; GFX90A-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX90A-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX90A-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX90A-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX90A-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX90A-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX90A-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX90A-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX90A-NEXT:    v_add3_u32 v6, v6, v4, s8
+; GFX90A-NEXT:    v_add3_u32 v8, v8, v5, s8
+; GFX90A-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX90A-NEXT:    v_cmp_u_f32_e64 s[4:5], v4, v4
+; GFX90A-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[4:5]
+; GFX90A-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX90A-NEXT:    v_perm_b32 v4, v5, v4, s9
+; GFX90A-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX90A-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
+; GFX90A-NEXT:    v_mov_b32_e32 v3, v4
+; GFX90A-NEXT:    s_andn2_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX90A-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX90A-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX940-LABEL: lds_atomic_fadd_noret_v2bf16:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    ds_read_b32 v3, v0
+; GFX940-NEXT:    s_mov_b64 s[2:3], 0
+; GFX940-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX940-NEXT:    s_movk_i32 s4, 0x7fff
+; GFX940-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX940-NEXT:    s_mov_b32 s5, 0x7060302
+; GFX940-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX940-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_lshlrev_b32_e32 v4, 16, v3
+; GFX940-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX940-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX940-NEXT:    v_add_f32_e32 v5, v5, v1
+; GFX940-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX940-NEXT:    v_bfe_u32 v8, v5, 16, 1
+; GFX940-NEXT:    v_or_b32_e32 v7, 0x400000, v4
+; GFX940-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX940-NEXT:    v_add3_u32 v6, v6, v4, s4
+; GFX940-NEXT:    v_add3_u32 v8, v8, v5, s4
+; GFX940-NEXT:    v_cmp_u_f32_e32 vcc, v5, v5
+; GFX940-NEXT:    v_cmp_u_f32_e64 s[0:1], v4, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v5, v8, v9, vcc
+; GFX940-NEXT:    v_cndmask_b32_e64 v4, v6, v7, s[0:1]
+; GFX940-NEXT:    v_perm_b32 v4, v5, v4, s5
+; GFX940-NEXT:    ds_cmpst_rtn_b32 v4, v0, v3, v4
+; GFX940-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX940-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX940-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX940-NEXT:    v_mov_b32_e32 v3, v4
+; GFX940-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX940-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: lds_atomic_fadd_noret_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    ds_load_b32 v3, v0
+; GFX12-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
+; GFX12-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
+; GFX12-NEXT:    s_mov_b32 s1, 0
+; GFX12-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v5, 0xffff0000, v3
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_dual_add_f32 v5, v5, v1 :: v_dual_lshlrev_b32 v4, 16, v3
+; GFX12-NEXT:    v_add_f32_e32 v4, v4, v2
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v7, v5, 16, 1
+; GFX12-NEXT:    v_bfe_u32 v6, v4, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v8, 0x400000, v4
+; GFX12-NEXT:    v_or_b32_e32 v9, 0x400000, v5
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v5, v5
+; GFX12-NEXT:    v_add3_u32 v7, v7, v5, 0x7fff
+; GFX12-NEXT:    v_add3_u32 v6, v6, v4, 0x7fff
+; GFX12-NEXT:    v_cmp_u_f32_e64 s0, v4, v4
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_cndmask_b32_e32 v5, v7, v9, vcc_lo
+; GFX12-NEXT:    v_cndmask_b32_e64 v4, v6, v8, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_perm_b32 v4, v5, v4, 0x7060302
+; GFX12-NEXT:    s_wait_storecnt 0x0
+; GFX12-NEXT:    ds_cmpstore_rtn_b32 v4, v0, v4, v3
+; GFX12-NEXT:    s_wait_dscnt 0x0
+; GFX12-NEXT:    global_inv scope:SCOPE_SE
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, v4, v3
+; GFX12-NEXT:    v_mov_b32_e32 v3, v4
+; GFX12-NEXT:    s_or_b32 s1, vcc_lo, s1
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    s_and_not1_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX12-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s1
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX7-LABEL: lds_atomic_fadd_noret_v2bf16:
 ; GFX7:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index e7c6044b3fb6b0..fb40274cac1bac 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
 ; GFX12-LABEL: flat_last_use_load_0:
@@ -107,6 +107,3 @@ entry:
 
 !0 = !{i32 1}
 declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-CU: {{.*}}
-; GFX12-WGP: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index c889c67a5ca37d..7a9cb992a0cd16 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: global_last_use_load_0:
@@ -92,6 +92,3 @@ entry:
 }
 !0 = !{i32 1}
 declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-CU: {{.*}}
-; GFX12-WGP: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 1f835349b12b0b..61cec731feb565 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -O0 -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefix=GFX12 %s
 
 define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
 ; GFX12-LABEL: private_last_use_load_0:
@@ -85,6 +85,3 @@ entry:
 
 !0 = !{i32 1}
 declare i32 @llvm.amdgcn.workitem.id.x()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX12-CU: {{.*}}
-; GFX12-WGP: {{.*}}
diff --git a/llvm/test/CodeGen/ARM/apple-version-min.ll b/llvm/test/CodeGen/ARM/apple-version-min.ll
new file mode 100644
index 00000000000000..180c19e5e36230
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/apple-version-min.ll
@@ -0,0 +1,12 @@
+; Test emitting version_min directives.
+
+; Let's not split this into separate ARM/AArch64 parts.
+; REQUIRES: aarch64-registered-target
+
+; RUN: llc %s -filetype=asm -o - --mtriple arm64-apple-tvos9.0.0 | FileCheck %s --check-prefix=TVOS
+; RUN: llc %s -filetype=asm -o - --mtriple thumbv7s-apple-ios7.0.0 | FileCheck %s --check-prefix=IOS
+; RUN: llc %s -filetype=asm -o - --mtriple thumbv7k-apple-watchos2.0.0 | FileCheck %s --check-prefix=WATCHOS
+
+; TVOS: .tvos_version_min 9, 0
+; IOS: .ios_version_min 7, 0
+; WATCHOS: .watchos_version_min 2, 0
diff --git a/llvm/test/CodeGen/ARM/fp-intrinsics.ll b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
index 64b22a5cc71bcc..e286eb3226e46f 100644
--- a/llvm/test/CodeGen/ARM/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/ARM/fp-intrinsics.ll
@@ -139,6 +139,13 @@ define float @cos_f32(float %x) #0 {
   ret float %val
 }
 
+; CHECK-LABEL: tan_f32:
+; CHECK: bl tanf
+define float @tan_f32(float %x) #0 {
+  %val = call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret float %val
+}
+
 ; CHECK-LABEL: pow_f32:
 ; CHECK: bl powf
 define float @pow_f32(float %x, float %y) #0 {
@@ -596,6 +603,13 @@ define double @cos_f64(double %x) #0 {
   ret double %val
 }
 
+; CHECK-LABEL: tan_f64:
+; CHECK: bl tan
+define double @tan_f64(double %x) #0 {
+  %val = call double @llvm.experimental.constrained.tan.f64(double %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0
+  ret double %val
+}
+
 ; CHECK-LABEL: pow_f64:
 ; CHECK: bl pow
 define double @pow_f64(double %x, double %y) #0 {
@@ -1023,6 +1037,7 @@ declare float @llvm.experimental.constrained.sqrt.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.powi.f32(float, i32, metadata, metadata)
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.cos.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata)
@@ -1056,6 +1071,7 @@ declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadat
 declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log10.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
index 50ebe0471dceac..402ecb763d5b33 100644
--- a/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
+++ b/llvm/test/CodeGen/PowerPC/ctrloop-constrained-fp.ll
@@ -83,5 +83,55 @@ exit:
   ret void
 }
 
+; Check constrained ops converted to call
+define void @testTan(ptr %cast) strictfp {
+; CHECK-LABEL: testTan:
+; CHECK:       # %bb.0: # %root
+; CHECK-NEXT:    mflr 0
+; CHECK-NEXT:    .cfi_def_cfa_offset 64
+; CHECK-NEXT:    .cfi_offset lr, 16
+; CHECK-NEXT:    .cfi_offset r29, -24
+; CHECK-NEXT:    .cfi_offset r30, -16
+; CHECK-NEXT:    std 29, -24(1) # 8-byte Folded Spill
+; CHECK-NEXT:    std 30, -16(1) # 8-byte Folded Spill
+; CHECK-NEXT:    stdu 1, -64(1)
+; CHECK-NEXT:    addi 30, 3, -8
+; CHECK-NEXT:    li 29, 255
+; CHECK-NEXT:    std 0, 80(1)
+; CHECK-NEXT:    .p2align 5
+; CHECK-NEXT:  .LBB2_1: # %for.body
+; CHECK-NEXT:    #
+; CHECK-NEXT:    lfdu 1, 8(30)
+; CHECK-NEXT:    bl tan
+; CHECK-NEXT:    nop
+; CHECK-NEXT:    addi 29, 29, -1
+; CHECK-NEXT:    stfd 1, 0(30)
+; CHECK-NEXT:    cmpldi 29, 0
+; CHECK-NEXT:    bc 12, 1, .LBB2_1
+; CHECK-NEXT:  # %bb.2: # %exit
+; CHECK-NEXT:    addi 1, 1, 64
+; CHECK-NEXT:    ld 0, 16(1)
+; CHECK-NEXT:    ld 30, -16(1) # 8-byte Folded Reload
+; CHECK-NEXT:    ld 29, -24(1) # 8-byte Folded Reload
+; CHECK-NEXT:    mtlr 0
+; CHECK-NEXT:    blr
+root:
+  br label %for.body
+
+exit:
+  ret void
+
+for.body:
+  %i = phi i64 [ 0, %root ], [ %next, %for.body ]
+  %idx = getelementptr inbounds double, ptr %cast, i64 %i
+  %val = load double, ptr %idx
+  %tan = tail call nnan ninf nsz arcp double @llvm.experimental.constrained.tan.f64(double %val, metadata !"round.dynamic", metadata !"fpexcept.strict")
+  store double %tan, ptr %idx, align 8
+  %next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %next, 255
+  br i1 %cond, label %exit, label %for.body
+}
+
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.sqrt.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
index 42972fe069df6a..76f3dea5b7751d 100644
--- a/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/ppcf128-constrained-fp-intrinsics.ll
@@ -2066,6 +2066,50 @@ entry:
   ret i1 %conv
 }
 
+define ppc_fp128 @test_tan_ppc_fp128(ppc_fp128 %first) #0 {
+; PC64LE-LABEL: test_tan_ppc_fp128:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -32(1)
+; PC64LE-NEXT:    std 0, 48(1)
+; PC64LE-NEXT:    bl tanl
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    addi 1, 1, 32
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: test_tan_ppc_fp128:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -32(1)
+; PC64LE9-NEXT:    std 0, 48(1)
+; PC64LE9-NEXT:    bl tanl
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    addi 1, 1, 32
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+;
+; PC64-LABEL: test_tan_ppc_fp128:
+; PC64:       # %bb.0: # %entry
+; PC64-NEXT:    mflr 0
+; PC64-NEXT:    stdu 1, -112(1)
+; PC64-NEXT:    std 0, 128(1)
+; PC64-NEXT:    bl tanl
+; PC64-NEXT:    nop
+; PC64-NEXT:    addi 1, 1, 112
+; PC64-NEXT:    ld 0, 16(1)
+; PC64-NEXT:    mtlr 0
+; PC64-NEXT:    blr
+entry:
+  %tan = call ppc_fp128 @llvm.experimental.constrained.tan.ppcf128(
+                    ppc_fp128 %first,
+                    metadata !"round.dynamic",
+                    metadata !"fpexcept.strict") #1
+  ret ppc_fp128 %tan
+}
+
 attributes #0 = { nounwind strictfp }
 attributes #1 = { strictfp }
 
@@ -2096,6 +2140,7 @@ declare ppc_fp128 @llvm.experimental.constrained.round.ppcf128(ppc_fp128, metada
 declare ppc_fp128 @llvm.experimental.constrained.sin.ppcf128(ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.sqrt.ppcf128(ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.fsub.ppcf128(ppc_fp128, ppc_fp128, metadata, metadata)
+declare ppc_fp128 @llvm.experimental.constrained.tan.ppcf128(ppc_fp128, metadata, metadata)
 declare ppc_fp128 @llvm.experimental.constrained.trunc.ppcf128(ppc_fp128, metadata)
 declare i64 @llvm.experimental.constrained.fptosi.i64.ppcf128(ppc_fp128, metadata)
 declare i32 @llvm.experimental.constrained.fptosi.i32.ppcf128(ppc_fp128, metadata)
diff --git a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
index 9cabe0c17d849d..f217162782bfd9 100644
--- a/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/PowerPC/vector-constrained-fp-intrinsics.ll
@@ -8302,6 +8302,357 @@ entry:
   ret <4 x float> %result
 }
 
+define <1 x float> @constrained_vector_tan_v1f32(<1 x float> %x) #0 {
+; PC64LE-LABEL: constrained_vector_tan_v1f32:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -32(1)
+; PC64LE-NEXT:    std 0, 48(1)
+; PC64LE-NEXT:    bl tanf
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    addi 1, 1, 32
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_tan_v1f32:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -32(1)
+; PC64LE9-NEXT:    std 0, 48(1)
+; PC64LE9-NEXT:    bl tanf
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    addi 1, 1, 32
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %tan = call <1 x float> @llvm.experimental.constrained.tan.v1f32(
+                             <1 x float> %x,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <1 x float> %tan
+}
+
+define <2 x double> @constrained_vector_tan_v2f64(<2 x double> %x) #0 {
+; PC64LE-LABEL: constrained_vector_tan_v2f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -80(1)
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 96(1)
+; PC64LE-NEXT:    stxvd2x 62, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 2
+; PC64LE-NEXT:    xxlor 1, 63, 63
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 62, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 63
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE-NEXT:    xxmrghd 34, 62, 1
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 80
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_tan_v2f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -64(1)
+; PC64LE9-NEXT:    std 0, 80(1)
+; PC64LE9-NEXT:    stxv 63, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 31, 2
+; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
+; PC64LE9-NEXT:    stxv 62, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 62, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 63
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT:    xxmrghd 34, 62, 1
+; PC64LE9-NEXT:    lxv 63, 48(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 62, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 64
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64(
+                             <2 x double> %x,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <2 x double> %tan
+}
+
+define <3 x float> @constrained_vector_tan_v3f32(<3 x float> %x) #0 {
+; PC64LE-LABEL: constrained_vector_tan_v3f32:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -80(1)
+; PC64LE-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 96(1)
+; PC64LE-NEXT:    stfd 30, 64(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 2
+; PC64LE-NEXT:    bl tanf
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxswapd 0, 63
+; PC64LE-NEXT:    fmr 31, 1
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    bl tanf
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxsldwi 0, 63, 63, 3
+; PC64LE-NEXT:    fmr 30, 1
+; PC64LE-NEXT:    xscvspdpn 1, 0
+; PC64LE-NEXT:    bl tanf
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xscvdpspn 0, 1
+; PC64LE-NEXT:    xscvdpspn 1, 30
+; PC64LE-NEXT:    addis 3, 2, .LCPI189_0@toc@ha
+; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    xscvdpspn 36, 31
+; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    addi 3, 3, .LCPI189_0@toc@l
+; PC64LE-NEXT:    xxmrghw 34, 1, 0
+; PC64LE-NEXT:    lxvd2x 0, 0, 3
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    xxswapd 35, 0
+; PC64LE-NEXT:    vperm 2, 4, 2, 3
+; PC64LE-NEXT:    addi 1, 1, 80
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_tan_v3f32:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -64(1)
+; PC64LE9-NEXT:    xxsldwi 0, 34, 34, 1
+; PC64LE9-NEXT:    std 0, 80(1)
+; PC64LE9-NEXT:    stfd 30, 48(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stfd 31, 56(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    vmr 31, 2
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    bl tanf
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxswapd 0, 63
+; PC64LE9-NEXT:    fmr 31, 1
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    bl tanf
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xxsldwi 0, 63, 63, 3
+; PC64LE9-NEXT:    fmr 30, 1
+; PC64LE9-NEXT:    xscvspdpn 1, 0
+; PC64LE9-NEXT:    bl tanf
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscvdpspn 0, 1
+; PC64LE9-NEXT:    xscvdpspn 1, 30
+; PC64LE9-NEXT:    addis 3, 2, .LCPI189_0@toc@ha
+; PC64LE9-NEXT:    xscvdpspn 34, 31
+; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    addi 3, 3, .LCPI189_0@toc@l
+; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    xxmrghw 35, 1, 0
+; PC64LE9-NEXT:    lxv 0, 0(3)
+; PC64LE9-NEXT:    xxperm 34, 35, 0
+; PC64LE9-NEXT:    addi 1, 1, 64
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32(
+                              <3 x float> %x,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict") #1
+  ret <3 x float> %tan
+}
+
+define <3 x double> @constrained_vector_tan_v3f64(<3 x double> %x) #0 {
+; PC64LE-LABEL: constrained_vector_tan_v3f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -80(1)
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 96(1)
+; PC64LE-NEXT:    stfd 30, 64(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    fmr 30, 2
+; PC64LE-NEXT:    stfd 31, 72(1) # 8-byte Folded Spill
+; PC64LE-NEXT:    fmr 31, 3
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 63, 1, 1
+; PC64LE-NEXT:    fmr 1, 30
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE-NEXT:    xxmrghd 63, 1, 63
+; PC64LE-NEXT:    fmr 1, 31
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    fmr 3, 1
+; PC64LE-NEXT:    xxswapd 1, 63
+; PC64LE-NEXT:    lfd 31, 72(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    xxlor 2, 63, 63
+; PC64LE-NEXT:    lfd 30, 64(1) # 8-byte Folded Reload
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 80
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_tan_v3f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -64(1)
+; PC64LE9-NEXT:    std 0, 80(1)
+; PC64LE9-NEXT:    stfd 30, 48(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stfd 31, 56(1) # 8-byte Folded Spill
+; PC64LE9-NEXT:    fmr 31, 3
+; PC64LE9-NEXT:    fmr 30, 2
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 63, 1, 1
+; PC64LE9-NEXT:    fmr 1, 30
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT:    xxmrghd 63, 1, 63
+; PC64LE9-NEXT:    fmr 1, 31
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    fmr 3, 1
+; PC64LE9-NEXT:    xxswapd 1, 63
+; PC64LE9-NEXT:    xscpsgndp 2, 63, 63
+; PC64LE9-NEXT:    lxv 63, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lfd 31, 56(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    lfd 30, 48(1) # 8-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 64
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64(
+                          <3 x double> %x,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict") #1
+  ret <3 x double> %tan
+}
+
+define <4 x double> @constrained_vector_tan_v4f64(<4 x double> %x) #0 {
+; PC64LE-LABEL: constrained_vector_tan_v4f64:
+; PC64LE:       # %bb.0: # %entry
+; PC64LE-NEXT:    mflr 0
+; PC64LE-NEXT:    stdu 1, -96(1)
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    std 0, 112(1)
+; PC64LE-NEXT:    stxvd2x 61, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    stxvd2x 62, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 30, 2
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    xxlor 1, 62, 62
+; PC64LE-NEXT:    stxvd2x 63, 1, 3 # 16-byte Folded Spill
+; PC64LE-NEXT:    vmr 31, 3
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 61, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 62
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE-NEXT:    xxmrghd 62, 61, 1
+; PC64LE-NEXT:    xxlor 1, 63, 63
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    xxlor 61, 1, 1
+; PC64LE-NEXT:    xxswapd 1, 63
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE-NEXT:    bl tan
+; PC64LE-NEXT:    nop
+; PC64LE-NEXT:    li 3, 80
+; PC64LE-NEXT:    vmr 2, 30
+; PC64LE-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE-NEXT:    xxmrghd 35, 61, 1
+; PC64LE-NEXT:    lxvd2x 63, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 64
+; PC64LE-NEXT:    lxvd2x 62, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    li 3, 48
+; PC64LE-NEXT:    lxvd2x 61, 1, 3 # 16-byte Folded Reload
+; PC64LE-NEXT:    addi 1, 1, 96
+; PC64LE-NEXT:    ld 0, 16(1)
+; PC64LE-NEXT:    mtlr 0
+; PC64LE-NEXT:    blr
+;
+; PC64LE9-LABEL: constrained_vector_tan_v4f64:
+; PC64LE9:       # %bb.0: # %entry
+; PC64LE9-NEXT:    mflr 0
+; PC64LE9-NEXT:    stdu 1, -80(1)
+; PC64LE9-NEXT:    std 0, 96(1)
+; PC64LE9-NEXT:    stxv 62, 48(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 30, 2
+; PC64LE9-NEXT:    xscpsgndp 1, 62, 62
+; PC64LE9-NEXT:    stxv 61, 32(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    stxv 63, 64(1) # 16-byte Folded Spill
+; PC64LE9-NEXT:    vmr 31, 3
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 62
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT:    xxmrghd 62, 61, 1
+; PC64LE9-NEXT:    xscpsgndp 1, 63, 63
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    xscpsgndp 61, 1, 1
+; PC64LE9-NEXT:    xxswapd 1, 63
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 killed $vsl1
+; PC64LE9-NEXT:    bl tan
+; PC64LE9-NEXT:    nop
+; PC64LE9-NEXT:    # kill: def $f1 killed $f1 def $vsl1
+; PC64LE9-NEXT:    xxmrghd 35, 61, 1
+; PC64LE9-NEXT:    vmr 2, 30
+; PC64LE9-NEXT:    lxv 63, 64(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 62, 48(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    lxv 61, 32(1) # 16-byte Folded Reload
+; PC64LE9-NEXT:    addi 1, 1, 80
+; PC64LE9-NEXT:    ld 0, 16(1)
+; PC64LE9-NEXT:    mtlr 0
+; PC64LE9-NEXT:    blr
+entry:
+  %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64(
+                             <4 x double> %x,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #1
+  ret <4 x double> %tan
+}
+
 attributes #0 = { nounwind strictfp noimplicitfloat }
 attributes #1 = { strictfp }
 
@@ -8316,6 +8667,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x
 declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
@@ -8361,6 +8713,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f
 declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
@@ -8414,6 +8767,8 @@ declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metada
 declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
@@ -8470,6 +8825,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x
 declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll
index 1c9356fb1a05a7..86b557700347e1 100644
--- a/llvm/test/CodeGen/RISCV/attributes.ll
+++ b/llvm/test/CodeGen/RISCV/attributes.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -mtriple=riscv32 -mattr=+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV32ZMMUL %s
 ; RUN: llc -mtriple=riscv32 -mattr=+m,+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV32MZMMUL %s
 ; RUN: llc -mtriple=riscv32 -mattr=+a %s -o - | FileCheck --check-prefixes=CHECK,RV32A %s
+; RUN: llc -mtriple=riscv32 -mattr=+b %s -o - | FileCheck --check-prefixes=CHECK,RV32B %s
 ; RUN: llc -mtriple=riscv32 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV32F %s
 ; RUN: llc -mtriple=riscv32 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV32D %s
 ; RUN: llc -mtriple=riscv32 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV32C %s
@@ -131,6 +132,7 @@
 ; RUN: llc -mtriple=riscv64 -mattr=+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV64ZMMUL %s
 ; RUN: llc -mtriple=riscv64 -mattr=+m,+zmmul %s -o - | FileCheck --check-prefixes=CHECK,RV64MZMMUL %s
 ; RUN: llc -mtriple=riscv64 -mattr=+a %s -o - | FileCheck --check-prefixes=CHECK,RV64A %s
+; RUN: llc -mtriple=riscv64 -mattr=+b %s -o - | FileCheck --check-prefixes=CHECK,RV64B %s
 ; RUN: llc -mtriple=riscv64 -mattr=+f %s -o - | FileCheck --check-prefixes=CHECK,RV64F %s
 ; RUN: llc -mtriple=riscv64 -mattr=+d %s -o - | FileCheck --check-prefixes=CHECK,RV64D %s
 ; RUN: llc -mtriple=riscv64 -mattr=+c %s -o - | FileCheck --check-prefixes=CHECK,RV64C %s
@@ -277,6 +279,7 @@
 ; RV32ZMMUL: .attribute 5, "rv32i2p1_zmmul1p0"
 ; RV32MZMMUL: .attribute 5, "rv32i2p1_m2p0_zmmul1p0"
 ; RV32A: .attribute 5, "rv32i2p1_a2p1"
+; RV32B: .attribute 5, "rv32i2p1_b1p0_zba1p0_zbb1p0_zbs1p0"
 ; RV32F: .attribute 5, "rv32i2p1_f2p2_zicsr2p0"
 ; RV32D: .attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0"
 ; RV32C: .attribute 5, "rv32i2p1_c2p0"
@@ -402,6 +405,7 @@
 ; RV64ZMMUL: .attribute 5, "rv64i2p1_zmmul1p0"
 ; RV64MZMMUL: .attribute 5, "rv64i2p1_m2p0_zmmul1p0"
 ; RV64A: .attribute 5, "rv64i2p1_a2p1"
+; RV64B: .attribute 5, "rv64i2p1_b1p0_zba1p0_zbb1p0_zbs1p0"
 ; RV64F: .attribute 5, "rv64i2p1_f2p2_zicsr2p0"
 ; RV64D: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0"
 ; RV64C: .attribute 5, "rv64i2p1_c2p0"
diff --git a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
index 38215860193eaf..4cb6191e7322e9 100644
--- a/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
+++ b/llvm/test/CodeGen/RISCV/double-intrinsics-strict.ll
@@ -375,6 +375,66 @@ define double @sincos_f64(double %a) nounwind strictfp {
   ret double %3
 }
 
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
+
+define double @tan_f64(double %a) nounwind strictfp {
+; RV32IFD-LABEL: tan_f64:
+; RV32IFD:       # %bb.0:
+; RV32IFD-NEXT:    addi sp, sp, -16
+; RV32IFD-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IFD-NEXT:    call tan
+; RV32IFD-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IFD-NEXT:    addi sp, sp, 16
+; RV32IFD-NEXT:    ret
+;
+; RV64IFD-LABEL: tan_f64:
+; RV64IFD:       # %bb.0:
+; RV64IFD-NEXT:    addi sp, sp, -16
+; RV64IFD-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IFD-NEXT:    call tan
+; RV64IFD-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IFD-NEXT:    addi sp, sp, 16
+; RV64IFD-NEXT:    ret
+;
+; RV32IZFINXZDINX-LABEL: tan_f64:
+; RV32IZFINXZDINX:       # %bb.0:
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, -16
+; RV32IZFINXZDINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFINXZDINX-NEXT:    call tan
+; RV32IZFINXZDINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFINXZDINX-NEXT:    addi sp, sp, 16
+; RV32IZFINXZDINX-NEXT:    ret
+;
+; RV64IZFINXZDINX-LABEL: tan_f64:
+; RV64IZFINXZDINX:       # %bb.0:
+; RV64IZFINXZDINX-NEXT:    addi sp, sp, -16
+; RV64IZFINXZDINX-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFINXZDINX-NEXT:    call tan
+; RV64IZFINXZDINX-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFINXZDINX-NEXT:    addi sp, sp, 16
+; RV64IZFINXZDINX-NEXT:    ret
+;
+; RV32I-LABEL: tan_f64:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    call tan
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: tan_f64:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    call tan
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %1 = call double @llvm.experimental.constrained.tan.f64(double %a, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
+  ret double %1
+}
+
 declare double @llvm.experimental.constrained.pow.f64(double, double, metadata, metadata)
 
 define double @pow_f64(double %a, double %b) nounwind strictfp {
diff --git a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
index 626db1985bfc7e..e4be5074cb800a 100644
--- a/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
+++ b/llvm/test/CodeGen/RISCV/float-intrinsics-strict.ll
@@ -354,6 +354,66 @@ define float @sincos_f32(float %a) nounwind strictfp {
   ret float %3
 }
 
+declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
+
+define float @tan_f32(float %a) nounwind strictfp {
+; RV32IF-LABEL: tan_f32:
+; RV32IF:       # %bb.0:
+; RV32IF-NEXT:    addi sp, sp, -16
+; RV32IF-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IF-NEXT:    call tanf
+; RV32IF-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IF-NEXT:    addi sp, sp, 16
+; RV32IF-NEXT:    ret
+;
+; RV64IF-LABEL: tan_f32:
+; RV64IF:       # %bb.0:
+; RV64IF-NEXT:    addi sp, sp, -16
+; RV64IF-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IF-NEXT:    call tanf
+; RV64IF-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IF-NEXT:    addi sp, sp, 16
+; RV64IF-NEXT:    ret
+;
+; RV32IZFINX-LABEL: tan_f32:
+; RV32IZFINX:       # %bb.0:
+; RV32IZFINX-NEXT:    addi sp, sp, -16
+; RV32IZFINX-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32IZFINX-NEXT:    call tanf
+; RV32IZFINX-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32IZFINX-NEXT:    addi sp, sp, 16
+; RV32IZFINX-NEXT:    ret
+;
+; RV64IZFINX-LABEL: tan_f32:
+; RV64IZFINX:       # %bb.0:
+; RV64IZFINX-NEXT:    addi sp, sp, -16
+; RV64IZFINX-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64IZFINX-NEXT:    call tanf
+; RV64IZFINX-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64IZFINX-NEXT:    addi sp, sp, 16
+; RV64IZFINX-NEXT:    ret
+;
+; RV32I-LABEL: tan_f32:
+; RV32I:       # %bb.0:
+; RV32I-NEXT:    addi sp, sp, -16
+; RV32I-NEXT:    sw ra, 12(sp) # 4-byte Folded Spill
+; RV32I-NEXT:    call tanf
+; RV32I-NEXT:    lw ra, 12(sp) # 4-byte Folded Reload
+; RV32I-NEXT:    addi sp, sp, 16
+; RV32I-NEXT:    ret
+;
+; RV64I-LABEL: tan_f32:
+; RV64I:       # %bb.0:
+; RV64I-NEXT:    addi sp, sp, -16
+; RV64I-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
+; RV64I-NEXT:    call tanf
+; RV64I-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
+; RV64I-NEXT:    addi sp, sp, 16
+; RV64I-NEXT:    ret
+  %1 = call float @llvm.experimental.constrained.tan.f32(float %a, metadata !"round.dynamic", metadata !"fpexcept.strict") strictfp
+  ret float %1
+}
+
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 
 define float @pow_f32(float %a, float %b) nounwind strictfp {
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll
index b3fcdc978625f9..e7b1b441405f61 100644
--- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll
+++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-break.ll
@@ -66,7 +66,7 @@ while.end:
 
 ; CHECK:   %[[#new_end]] = OpLabel
 ; CHECK:    %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_1]] %[[#while_cond]] %[[#int_0]] %[[#while_body]]
-; CHECK:                   OpSwitch %[[#route]] %[[#while_end_loopexit]] 0 %[[#if_then]]
+; CHECK:                   OpSwitch %[[#route]] %[[#if_then]] 1 %[[#while_end_loopexit]]
 }
 
 declare token @llvm.experimental.convergence.entry() #2
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll
index a67c58fdd5749f..593e3631c02b9d 100644
--- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll
+++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-convergence-in-break.ll
@@ -75,7 +75,7 @@ while.end:
 
 ; CHECK:   %[[#new_end]] = OpLabel
 ; CHECK:    %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_0]] %[[#while_cond]] %[[#int_1]] %[[#tail]]
-; CHECK:                   OpSwitch %[[#route]] %[[#while_end]] 0 %[[#while_end_loopexit]]
+; CHECK:                   OpSwitch %[[#route]] %[[#while_end_loopexit]] 1 %[[#while_end]]
 }
 
 declare token @llvm.experimental.convergence.entry() #2
diff --git a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll
index 32a97553df05e3..9806dd7955468e 100644
--- a/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll
+++ b/llvm/test/CodeGen/SPIRV/structurizer/merge-exit-multiple-break.ll
@@ -85,7 +85,7 @@ while.end:
 
 ; CHECK:   %[[#new_end]] = OpLabel
 ; CHECK:    %[[#route:]] = OpPhi %[[#int_ty]] %[[#int_2]] %[[#while_cond]] %[[#int_0]] %[[#while_body]] %[[#int_1]] %[[#if_end]]
-; CHECK:                   OpSwitch %[[#route]] %[[#while_end_loopexit]] 1 %[[#if_then2]] 0 %[[#if_then]]
+; CHECK:                   OpSwitch %[[#route]] %[[#if_then]] 1 %[[#if_then2]] 2 %[[#while_end_loopexit]]
 }
 
 declare token @llvm.experimental.convergence.entry() #2
diff --git a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
index 9d77744f18ca1a..4a109ee96a3d3e 100644
--- a/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/SystemZ/vector-constrained-fp-intrinsics.ll
@@ -6222,6 +6222,323 @@ entry:
   ret void
 }
 
+define <1 x float> @constrained_vector_tan_v1f32() #0 {
+; S390X-LABEL: constrained_vector_tan_v1f32:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -160
+; S390X-NEXT:    .cfi_def_cfa_offset 320
+; S390X-NEXT:    larl %r1, .LCPI119_0
+; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    brasl %r14, tanf@PLT
+; S390X-NEXT:    lmg %r14, %r15, 272(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_tan_v1f32:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -160
+; SZ13-NEXT:    .cfi_def_cfa_offset 320
+; SZ13-NEXT:    larl %r1, .LCPI119_0
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tanf@PLT
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vlr %v24, %v0
+; SZ13-NEXT:    lmg %r14, %r15, 272(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %tan = call <1 x float> @llvm.experimental.constrained.tan.v1f32(
+                             <1 x float> <float 42.0>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <1 x float> %tan
+}
+
+define <2 x double> @constrained_vector_tan_v2f64() #0 {
+; S390X-LABEL: constrained_vector_tan_v2f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -168
+; S390X-NEXT:    .cfi_def_cfa_offset 328
+; S390X-NEXT:    std %f8, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    larl %r1, .LCPI120_0
+; S390X-NEXT:    ld %f0, 0(%r1)
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    larl %r1, .LCPI120_1
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    ldr %f8, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    ldr %f2, %f8
+; S390X-NEXT:    ld %f8, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 280(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_tan_v2f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -176
+; SZ13-NEXT:    .cfi_def_cfa_offset 336
+; SZ13-NEXT:    larl %r1, .LCPI120_0
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    larl %r1, .LCPI120_1
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v24, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 288(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64(
+                             <2 x double> <double 42.0, double 42.1>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <2 x double> %tan
+}
+
+define <3 x float> @constrained_vector_tan_v3f32() #0 {
+; S390X-LABEL: constrained_vector_tan_v3f32:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -176
+; S390X-NEXT:    .cfi_def_cfa_offset 336
+; S390X-NEXT:    std %f8, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    larl %r1, .LCPI121_0
+; S390X-NEXT:    le %f0, 0(%r1)
+; S390X-NEXT:    brasl %r14, tanf@PLT
+; S390X-NEXT:    larl %r1, .LCPI121_1
+; S390X-NEXT:    le %f1, 0(%r1)
+; S390X-NEXT:    ler %f8, %f0
+; S390X-NEXT:    ler %f0, %f1
+; S390X-NEXT:    brasl %r14, tanf@PLT
+; S390X-NEXT:    larl %r1, .LCPI121_2
+; S390X-NEXT:    le %f1, 0(%r1)
+; S390X-NEXT:    ler %f9, %f0
+; S390X-NEXT:    ler %f0, %f1
+; S390X-NEXT:    brasl %r14, tanf@PLT
+; S390X-NEXT:    ler %f2, %f9
+; S390X-NEXT:    ler %f4, %f8
+; S390X-NEXT:    ld %f8, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 288(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_tan_v3f32:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -192
+; SZ13-NEXT:    .cfi_def_cfa_offset 352
+; SZ13-NEXT:    larl %r1, .LCPI121_0
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tanf@PLT
+; SZ13-NEXT:    larl %r1, .LCPI121_1
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tanf@PLT
+; SZ13-NEXT:    larl %r1, .LCPI121_2
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    lde %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tanf@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0s killed $f0s def $v0
+; SZ13-NEXT:    vmrhf %v0, %v1, %v0
+; SZ13-NEXT:    vl %v1, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vrepf %v1, %v1, 0
+; SZ13-NEXT:    vmrhg %v24, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 304(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32(
+                              <3 x float> <float 42.0, float 43.0, float 44.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict") #0
+  ret <3 x float> %tan
+}
+
+define void @constrained_vector_tan_v3f64(ptr %a) #0 {
+; S390X-LABEL: constrained_vector_tan_v3f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r13, %r15, 104(%r15)
+; S390X-NEXT:    .cfi_offset %r13, -56
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -184
+; S390X-NEXT:    .cfi_def_cfa_offset 344
+; S390X-NEXT:    std %f8, 176(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f10, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    .cfi_offset %f10, -184
+; S390X-NEXT:    lgr %r13, %r2
+; S390X-NEXT:    ld %f8, 0(%r2)
+; S390X-NEXT:    ld %f0, 16(%r2)
+; S390X-NEXT:    ld %f9, 8(%r2)
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    ldr %f10, %f0
+; S390X-NEXT:    ldr %f0, %f9
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    ldr %f9, %f0
+; S390X-NEXT:    ldr %f0, %f8
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    std %f0, 0(%r13)
+; S390X-NEXT:    std %f9, 8(%r13)
+; S390X-NEXT:    std %f10, 16(%r13)
+; S390X-NEXT:    ld %f8, 176(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f10, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r13, %r15, 288(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_tan_v3f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r13, %r15, 104(%r15)
+; SZ13-NEXT:    .cfi_offset %r13, -56
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -200
+; SZ13-NEXT:    .cfi_def_cfa_offset 360
+; SZ13-NEXT:    std %f8, 192(%r15) # 8-byte Folded Spill
+; SZ13-NEXT:    .cfi_offset %f8, -168
+; SZ13-NEXT:    vl %v0, 0(%r2), 4
+; SZ13-NEXT:    ld %f8, 16(%r2)
+; SZ13-NEXT:    lgr %r13, %r2
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    # kill: def $f0d killed $f0d killed $v0
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    vl %v0, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vrepg %v0, %v0, 1
+; SZ13-NEXT:    # kill: def $f0d killed $f0d killed $v0
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v0, %v1, %v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ldr %f0, %f8
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    std %f0, 16(%r13)
+; SZ13-NEXT:    vl %v0, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    ld %f8, 192(%r15) # 8-byte Folded Reload
+; SZ13-NEXT:    vst %v0, 0(%r13), 4
+; SZ13-NEXT:    lmg %r13, %r15, 304(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %b = load <3 x double>, ptr %a
+  %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64(
+                          <3 x double> %b,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict") #0
+  store <3 x double> %tan, ptr %a
+  ret void
+}
+
+define <4 x double> @constrained_vector_tan_v4f64() #0 {
+; S390X-LABEL: constrained_vector_tan_v4f64:
+; S390X:       # %bb.0: # %entry
+; S390X-NEXT:    stmg %r14, %r15, 112(%r15)
+; S390X-NEXT:    .cfi_offset %r14, -48
+; S390X-NEXT:    .cfi_offset %r15, -40
+; S390X-NEXT:    aghi %r15, -184
+; S390X-NEXT:    .cfi_def_cfa_offset 344
+; S390X-NEXT:    std %f8, 176(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f9, 168(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    std %f10, 160(%r15) # 8-byte Folded Spill
+; S390X-NEXT:    .cfi_offset %f8, -168
+; S390X-NEXT:    .cfi_offset %f9, -176
+; S390X-NEXT:    .cfi_offset %f10, -184
+; S390X-NEXT:    larl %r1, .LCPI123_0
+; S390X-NEXT:    ld %f0, 0(%r1)
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    larl %r1, .LCPI123_1
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    ldr %f8, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    larl %r1, .LCPI123_2
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    ldr %f9, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    larl %r1, .LCPI123_3
+; S390X-NEXT:    ld %f1, 0(%r1)
+; S390X-NEXT:    ldr %f10, %f0
+; S390X-NEXT:    ldr %f0, %f1
+; S390X-NEXT:    brasl %r14, tan@PLT
+; S390X-NEXT:    ldr %f2, %f10
+; S390X-NEXT:    ldr %f4, %f9
+; S390X-NEXT:    ldr %f6, %f8
+; S390X-NEXT:    ld %f8, 176(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f9, 168(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    ld %f10, 160(%r15) # 8-byte Folded Reload
+; S390X-NEXT:    lmg %r14, %r15, 296(%r15)
+; S390X-NEXT:    br %r14
+;
+; SZ13-LABEL: constrained_vector_tan_v4f64:
+; SZ13:       # %bb.0: # %entry
+; SZ13-NEXT:    stmg %r14, %r15, 112(%r15)
+; SZ13-NEXT:    .cfi_offset %r14, -48
+; SZ13-NEXT:    .cfi_offset %r15, -40
+; SZ13-NEXT:    aghi %r15, -192
+; SZ13-NEXT:    .cfi_def_cfa_offset 352
+; SZ13-NEXT:    larl %r1, .LCPI123_0
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    larl %r1, .LCPI123_1
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    vl %v1, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v0, %v0, %v1
+; SZ13-NEXT:    larl %r1, .LCPI123_2
+; SZ13-NEXT:    vst %v0, 160(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    larl %r1, .LCPI123_3
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vst %v0, 176(%r15), 3 # 16-byte Folded Spill
+; SZ13-NEXT:    ld %f0, 0(%r1)
+; SZ13-NEXT:    brasl %r14, tan@PLT
+; SZ13-NEXT:    vl %v1, 176(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    vl %v24, 160(%r15), 3 # 16-byte Folded Reload
+; SZ13-NEXT:    # kill: def $f0d killed $f0d def $v0
+; SZ13-NEXT:    vmrhg %v26, %v0, %v1
+; SZ13-NEXT:    lmg %r14, %r15, 304(%r15)
+; SZ13-NEXT:    br %r14
+entry:
+  %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64(
+                             <4 x double> <double 42.0, double 42.1,
+                                           double 42.2, double 42.3>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <4 x double> %tan
+}
+
 attributes #0 = { strictfp }
 
 declare <2 x double> @llvm.experimental.constrained.fadd.v2f64(<2 x double>, <2 x double>, metadata, metadata)
@@ -6234,6 +6551,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x
 declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
@@ -6260,6 +6578,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f
 declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
@@ -6296,6 +6615,8 @@ declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metada
 declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
@@ -6335,6 +6656,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x
 declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/WebAssembly/libcalls.ll b/llvm/test/CodeGen/WebAssembly/libcalls.ll
index 4f57c347a1a335..70f000664d388a 100644
--- a/llvm/test/CodeGen/WebAssembly/libcalls.ll
+++ b/llvm/test/CodeGen/WebAssembly/libcalls.ll
@@ -12,6 +12,7 @@ declare fp128 @llvm.nearbyint.f128(fp128)
 declare fp128 @llvm.pow.f128(fp128, fp128)
 declare fp128 @llvm.powi.f128.i32(fp128, i32)
 
+declare double @llvm.tan.f64(double)
 declare double @llvm.cos.f64(double)
 declare double @llvm.log10.f64(double)
 declare double @llvm.pow.f64(double, double)
@@ -240,42 +241,44 @@ define double @f64libcalls(double %x, double %y, i32 %z) {
 ; CHECK:         .functype f64libcalls (f64, f64, i32) -> (f64)
 ; CHECK-NEXT:    .local i32
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    global.get $push11=, __stack_pointer
-; CHECK-NEXT:    i32.const $push12=, 16
-; CHECK-NEXT:    i32.sub $push18=, $pop11, $pop12
-; CHECK-NEXT:    local.tee $push17=, 3, $pop18
-; CHECK-NEXT:    global.set __stack_pointer, $pop17
-; CHECK-NEXT:    local.get $push22=, 0
-; CHECK-NEXT:    local.get $push19=, 0
-; CHECK-NEXT:    call $push0=, cos, $pop19
-; CHECK-NEXT:    call $push1=, log10, $pop0
-; CHECK-NEXT:    local.get $push20=, 1
-; CHECK-NEXT:    call $push2=, pow, $pop1, $pop20
-; CHECK-NEXT:    local.get $push21=, 2
-; CHECK-NEXT:    call $push3=, __powidf2, $pop2, $pop21
-; CHECK-NEXT:    call $push4=, log, $pop3
-; CHECK-NEXT:    call $push5=, exp, $pop4
-; CHECK-NEXT:    call $push6=, exp10, $pop5
-; CHECK-NEXT:    call $push7=, cbrt, $pop6
-; CHECK-NEXT:    call $push8=, lround, $pop7
-; CHECK-NEXT:    call $push9=, ldexp, $pop22, $pop8
-; CHECK-NEXT:    local.get $push23=, 3
-; CHECK-NEXT:    i32.const $push15=, 12
-; CHECK-NEXT:    i32.add $push16=, $pop23, $pop15
-; CHECK-NEXT:    call $push24=, frexp, $pop9, $pop16
-; CHECK-NEXT:    local.set 0, $pop24
-; CHECK-NEXT:    local.get $push25=, 3
-; CHECK-NEXT:    i32.load $push10=, 12($pop25)
-; CHECK-NEXT:    call escape_value, $pop10
-; CHECK-NEXT:    local.get $push26=, 3
+; CHECK-NEXT:    global.get $push12=, __stack_pointer
 ; CHECK-NEXT:    i32.const $push13=, 16
-; CHECK-NEXT:    i32.add $push14=, $pop26, $pop13
-; CHECK-NEXT:    global.set __stack_pointer, $pop14
-; CHECK-NEXT:    local.get $push27=, 0
-; CHECK-NEXT:    return $pop27
+; CHECK-NEXT:    i32.sub  $push19=, $pop12, $pop13
+; CHECK-NEXT:    local.tee $push18=, 3, $pop19
+; CHECK-NEXT:    global.set __stack_pointer, $pop18
+; CHECK-NEXT:    local.get $push23=, 0
+; CHECK-NEXT:    local.get $push20=, 0
+; CHECK-NEXT:    call $push0=, tan, $pop20
+; CHECK-NEXT:    call $push1=, cos, $pop0
+; CHECK-NEXT:    call $push2=, log10, $pop1
+; CHECK-NEXT:    local.get $push21=, 1
+; CHECK-NEXT:    call $push3=, pow, $pop2, $pop21
+; CHECK-NEXT:    local.get $push22=, 2
+; CHECK-NEXT:    call $push4=, __powidf2, $pop3, $pop22
+; CHECK-NEXT:    call $push5=, log, $pop4
+; CHECK-NEXT:    call $push6=, exp, $pop5
+; CHECK-NEXT:    call $push7=, exp10, $pop6
+; CHECK-NEXT:    call $push8=, cbrt, $pop7
+; CHECK-NEXT:    call $push9=, lround, $pop8
+; CHECK-NEXT:    call $push10=, ldexp, $pop23, $pop9
+; CHECK-NEXT:    local.get $push24=, 3
+; CHECK-NEXT:    i32.const $push16=, 12
+; CHECK-NEXT:    i32.add  $push17=, $pop24, $pop16
+; CHECK-NEXT:    call $push25=, frexp, $pop10, $pop17
+; CHECK-NEXT:    local.set 0, $pop25
+; CHECK-NEXT:    local.get $push26=, 3
+; CHECK-NEXT:    i32.load $push11=, 12($pop26)
+; CHECK-NEXT:    call escape_value, $pop11
+; CHECK-NEXT:    local.get $push27=, 3
+; CHECK-NEXT:    i32.const $push14=, 16
+; CHECK-NEXT:    i32.add  $push15=, $pop27, $pop14
+; CHECK-NEXT:    global.set __stack_pointer, $pop15
+; CHECK-NEXT:    local.get $push28=, 0
+; CHECK-NEXT:    return $pop28
 
 
- %a = call double @llvm.cos.f64(double %x)
+ %k = call double @llvm.tan.f64(double %x)
+ %a = call double @llvm.cos.f64(double %k)
  %b = call double @llvm.log10.f64(double %a)
  %c = call double @llvm.pow.f64(double %b, double %y)
  %d = call double @llvm.powi.f64.i32(double %c, i32 %z)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir
index e79f4d1f989a91..eef1f43b278d9d 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi.mir
@@ -87,7 +87,8 @@ liveins:
   - { reg: '$rsi', virtual-reg: '%14' }
 frameInfo:
   maxAlignment:    1
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir
index d47bda0044115e..5843366baab6d1 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi2.mir
@@ -34,7 +34,8 @@ liveins:
   - { reg: '$edi', virtual-reg: '%12' }
 frameInfo:
   maxAlignment:    1
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir
index 15d3eb6bdfebb6..4eb8b950851895 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-phi4.mir
@@ -35,7 +35,8 @@ liveins:
   - { reg: '$edi', virtual-reg: '%12' }
 frameInfo:
   maxAlignment:    1
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir
index 98744bbe8e1473..1ed4328bf132a1 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig-spill.mir
@@ -23,7 +23,8 @@ frameInfo:
 stack:
   - { id: 0, size: 1024, alignment: 16 }
   - { id: 1, size: 64, alignment: 4 }
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: foo
   ; CHECK: bb.0.entry:
@@ -100,7 +101,8 @@ frameInfo:
 stack:
   - { id: 0, size: 1024, alignment: 16 }
   - { id: 1, size: 64, alignment: 4 }
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: copy
   ; CHECK: bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
index 84fc47a3a91202..561ba6f2f49709 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastconfig.mir
@@ -77,7 +77,8 @@ liveins:
   - { reg: '$edx', virtual-reg: '%11' }
 frameInfo:
   maxAlignment:    1
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   ; CHECK-LABEL: name: test_api
   ; CHECK: bb.0.entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir
index 40566520b79f01..0d56feac626814 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir
+++ b/llvm/test/CodeGen/X86/AMX/amx-fastpreconfig.mir
@@ -23,7 +23,8 @@ frameInfo:
 stack:
   - { id: 0, size: 1024, alignment: 16 }
   - { id: 1, size: 64, alignment: 4 }
-machineFunctionInfo: {}
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: main
@@ -79,6 +80,8 @@ registers:
 liveins:
   - { reg: '$rdi', virtual-reg: '' }
   - { reg: '$rsi', virtual-reg: '' }
+machineFunctionInfo:
+  amxProgModel: ManagedRA
 body:             |
   bb.1.entry:
     liveins: $rdi, $rsi
diff --git a/llvm/test/CodeGen/X86/abs.ll b/llvm/test/CodeGen/X86/abs.ll
index 7642c0c9541429..dde877c5bb61e5 100644
--- a/llvm/test/CodeGen/X86/abs.ll
+++ b/llvm/test/CodeGen/X86/abs.ll
@@ -713,17 +713,15 @@ define i128 @test_sextinreg_i128(i128 %a) nounwind {
 define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind {
 ; X64-LABEL: test_minsigned_i8:
 ; X64:       # %bb.0:
-; X64-NEXT:    cmpb $-128, %dil
-; X64-NEXT:    jne .LBB17_1
-; X64-NEXT:  # %bb.2: # %select.end
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB17_1: # %select.false.sink
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    sarb $7, %al
-; X64-NEXT:    xorb %al, %dil
-; X64-NEXT:    subb %al, %dil
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movl %edi, %ecx
+; X64-NEXT:    xorb %al, %cl
+; X64-NEXT:    subb %al, %cl
+; X64-NEXT:    cmpb $-128, %dil
+; X64-NEXT:    movzbl %cl, %eax
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_minsigned_i8:
@@ -731,14 +729,17 @@ define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind {
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpb $-128, %al
 ; X86-NEXT:    jne .LBB17_1
-; X86-NEXT:  # %bb.2: # %select.end
-; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  # %bb.2:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
-; X86-NEXT:  .LBB17_1: # %select.false.sink
+; X86-NEXT:  .LBB17_1:
 ; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    sarb $7, %cl
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    subb %cl, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
   %lim = icmp eq i8 %a0, -128
   %abs = tail call i8 @llvm.abs.i8(i8 %a0, i1 false)
@@ -749,30 +750,26 @@ define i8 @test_minsigned_i8(i8 %a0, i8 %a1) nounwind {
 define i16 @test_minsigned_i16(i16 %a0, i16 %a1) nounwind {
 ; X64-LABEL: test_minsigned_i16:
 ; X64:       # %bb.0:
-; X64-NEXT:    movzwl %di, %eax
-; X64-NEXT:    cmpl $32768, %eax # imm = 0x8000
-; X64-NEXT:    jne .LBB18_1
-; X64-NEXT:  # %bb.2: # %select.end
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB18_1: # %select.false.sink
-; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    movzwl %di, %ecx
+; X64-NEXT:    movl %ecx, %eax
 ; X64-NEXT:    negw %ax
-; X64-NEXT:    cmovsw %di, %ax
+; X64-NEXT:    cmovsw %cx, %ax
+; X64-NEXT:    cmpl $32768, %ecx # imm = 0x8000
+; X64-NEXT:    cmovel %esi, %eax
+; X64-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_minsigned_i16:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl $32768, %ecx # imm = 0x8000
-; X86-NEXT:    jne .LBB18_1
-; X86-NEXT:  # %bb.2: # %select.end
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB18_1: # %select.false.sink
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negw %ax
 ; X86-NEXT:    cmovsw %cx, %ax
+; X86-NEXT:    cmpl $32768, %ecx # imm = 0x8000
+; X86-NEXT:    jne .LBB18_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB18_2:
 ; X86-NEXT:    retl
   %lim = icmp eq i16 %a0, -32768
   %abs = tail call i16 @llvm.abs.i16(i16 %a0, i1 false)
@@ -783,29 +780,24 @@ define i16 @test_minsigned_i16(i16 %a0, i16 %a1) nounwind {
 define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind {
 ; X64-LABEL: test_minsigned_i32:
 ; X64:       # %bb.0:
-; X64-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
-; X64-NEXT:    jne .LBB19_1
-; X64-NEXT:  # %bb.2: # %select.end
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB19_1: # %select.false.sink
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    negl %eax
 ; X64-NEXT:    cmovsl %edi, %eax
+; X64-NEXT:    cmpl $-2147483648, %edi # imm = 0x80000000
+; X64-NEXT:    cmovel %esi, %eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_minsigned_i32:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl $-2147483648, %ecx # imm = 0x80000000
-; X86-NEXT:    jne .LBB19_1
-; X86-NEXT:  # %bb.2: # %select.end
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    retl
-; X86-NEXT:  .LBB19_1: # %select.false.sink
 ; X86-NEXT:    movl %ecx, %eax
 ; X86-NEXT:    negl %eax
 ; X86-NEXT:    cmovsl %ecx, %eax
+; X86-NEXT:    cmpl $-2147483648, %ecx # imm = 0x80000000
+; X86-NEXT:    jne .LBB19_2
+; X86-NEXT:  # %bb.1:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:  .LBB19_2:
 ; X86-NEXT:    retl
   %lim = icmp eq i32 %a0, -2147483648
   %abs = tail call i32 @llvm.abs.i32(i32 %a0, i1 false)
@@ -816,16 +808,12 @@ define i32 @test_minsigned_i32(i32 %a0, i32 %a1) nounwind {
 define i64 @test_minsigned_i64(i64 %a0, i64 %a1) nounwind {
 ; X64-LABEL: test_minsigned_i64:
 ; X64:       # %bb.0:
-; X64-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
-; X64-NEXT:    cmpq %rax, %rdi
-; X64-NEXT:    jne .LBB20_1
-; X64-NEXT:  # %bb.2: # %select.end
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    retq
-; X64-NEXT:  .LBB20_1: # %select.false.sink
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    negq %rax
 ; X64-NEXT:    cmovsq %rdi, %rax
+; X64-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; X64-NEXT:    cmpq %rcx, %rdi
+; X64-NEXT:    cmoveq %rsi, %rax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_minsigned_i64:
diff --git a/llvm/test/CodeGen/X86/apple-version-min.ll b/llvm/test/CodeGen/X86/apple-version-min.ll
new file mode 100644
index 00000000000000..fde10ac3b42483
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apple-version-min.ll
@@ -0,0 +1,12 @@
+; Test emitting version_min directives.
+
+; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-tvos9.0.0-simulator | FileCheck %s --check-prefix=TVOS
+; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-tvos9.0.0 | FileCheck %s --check-prefix=TVOS
+; RUN: llc %s -filetype=asm -o - --mtriple x86_64-apple-driverkit19.0.0 | FileCheck %s --check-prefix=DRIVERKIT
+; RUN: llc %s -filetype=asm -o - --mtriple i386-apple-ios7.0.0-simulator | FileCheck %s --check-prefix=IOS
+; RUN: llc %s -filetype=asm -o - --mtriple i386-apple-watchos2.0.0-simulator | FileCheck %s --check-prefix=WATCHOS
+
+; TVOS: .tvos_version_min 9, 0
+; DRIVERKIT: .build_version driverkit, 19, 0
+; IOS: .ios_version_min 7, 0
+; WATCHOS: .watchos_version_min 2, 0
diff --git a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll
index 4e07070fba0d1e..054485a358066e 100644
--- a/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll
+++ b/llvm/test/CodeGen/X86/early-ifcvt-remarks.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc %s -x86-early-ifcvt -pass-remarks='early-ifcvt' -pass-remarks-missed='early-ifcvt' -mcpu=k8 -o - 2>&1 | FileCheck %s
 target triple = "x86_64-none-none"
 
@@ -6,7 +7,26 @@ target triple = "x86_64-none-none"
 ; CHECK-SAME: and the short leg adds another {{[0-9]+}} cycles{{s?}},
 ; CHECK-SAME: and the long leg adds another {{[0-9]+}} cycles{{s?}},
 ; CHECK-SAME: each staying under the threshold of {{[0-9]+}} cycles{{s?}}.
+
+; CHECK: remark: <unknown>:0:0: did not if-convert branch:
+; CHECK-SAME: the condition would add {{[0-9]+}} cycles{{s?}} to the critical path,
+; CHECK-SAME: and the short leg would add another {{[0-9]+}} cycles{{s?}},
+; CHECK-SAME: and the long leg would add another {{[0-9]+}} cycles{{s?}} exceeding the limit of {{[0-9]+}} cycles{{s?}}.
+
+; CHECK: remark: <unknown>:0:0: did not if-convert branch:
+; CHECK-SAME: the resulting critical path ({{[0-9]+}} cycles{{s?}})
+; CHECK-SAME: would extend the shorter leg's critical path ({{[0-9]+}} cycle{{s?}})
+; CHECK-SAME: by more than the threshold of {{[0-9]+}} cycles{{s?}},
+; CHECK-SAME: which cannot be hidden by available ILP.
+
 define i32 @mm1(i1 %pred, i32 %val) {
+; CHECK-LABEL: mm1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    # kill: def $esi killed $esi def $rsi
+; CHECK-NEXT:    leal 1(%rsi), %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovel %esi, %eax
+; CHECK-NEXT:    retq
 entry:
   br i1 %pred, label %if.true, label %if.else
 
@@ -19,11 +39,20 @@ if.else:
   ret i32 %res
 }
 
-; CHECK: remark: <unknown>:0:0: did not if-convert branch:
-; CHECK-SAME: the condition would add {{[0-9]+}} cycles{{s?}} to the critical path,
-; CHECK-SAME: and the short leg would add another {{[0-9]+}} cycles{{s?}},
-; CHECK-SAME: and the long leg would add another {{[0-9]+}} cycles{{s?}} exceeding the limit of {{[0-9]+}} cycles{{s?}}.
 define i32 @mm2(i1 %pred, i32 %val, i32 %e1, i32 %e2, i32 %e3, i32 %e4, i32 %e5) {
+; CHECK-LABEL: mm2:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    je .LBB1_2
+; CHECK-NEXT:  # %bb.1: # %if.true
+; CHECK-NEXT:    addl %eax, %edx
+; CHECK-NEXT:    addl %ecx, %r8d
+; CHECK-NEXT:    addl %edx, %r8d
+; CHECK-NEXT:    addl %r8d, %r9d
+; CHECK-NEXT:    movl %r9d, %eax
+; CHECK-NEXT:  .LBB1_2: # %if.else
+; CHECK-NEXT:    retq
 entry:
   br i1 %pred, label %if.true, label %if.else
 
@@ -39,12 +68,48 @@ if.else:
   ret i32 %res
 }
 
-; CHECK: did not if-convert branch:
-; CHECK-SAME: the resulting critical path ({{[0-9]+}} cycles{{s?}})
-; CHECK-SAME: would extend the shorter leg's critical path ({{[0-9]+}} cycle{{s?}})
-; CHECK-SAME: by more than the threshold of {{[0-9]+}} cycles{{s?}},
-; CHECK-SAME: which cannot be hidden by available ILP.
-define i32 @mm3(i1 %pred, i32 %val, i32 %e1, i128 %e2, i128 %e3, i128 %e4, i128 %e5) {
+define i64 @mm3(i1 %pred, i64 %val, i64 %e1, i128 %e2, i128 %e3, i128 %e4, i128 %e5) {
+; CHECK-LABEL: mm3:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    movq %rsi, %r10
+; CHECK-NEXT:    jne .LBB2_2
+; CHECK-NEXT:  # %bb.1: # %if.false
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    imulq %rdx, %rdx
+; CHECK-NEXT:    movq %rdx, %r10
+; CHECK-NEXT:    sarq $63, %r10
+; CHECK-NEXT:    movq %rcx, %rax
+; CHECK-NEXT:    movq %rdx, %r9
+; CHECK-NEXT:    mulq %rdx
+; CHECK-NEXT:    imulq %rcx, %r10
+; CHECK-NEXT:    imulq %r9, %r8
+; CHECK-NEXT:    addq %rdx, %r8
+; CHECK-NEXT:    addq %r10, %r8
+; CHECK-NEXT:    addq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    adcq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    xorq %r8, %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; CHECK-NEXT:    xorq %rsi, %r10
+; CHECK-NEXT:    xorq %rax, %r10
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    sarq %cl, %rax
+; CHECK-NEXT:    addq %rdi, %rdi
+; CHECK-NEXT:    notb %cl
+; CHECK-NEXT:    shlq %cl, %rdi
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    shrq %cl, %r10
+; CHECK-NEXT:    orq %rdi, %r10
+; CHECK-NEXT:    testb $64, %sil
+; CHECK-NEXT:    cmovneq %rax, %r10
+; CHECK-NEXT:    movq %r9, %rax
+; CHECK-NEXT:  .LBB2_2: # %if.endif
+; CHECK-NEXT:    addq %r10, %rax
+; CHECK-NEXT:    retq
 entry:
   br i1 %pred, label %if.true, label %if.false
 
@@ -52,19 +117,19 @@ if.true:
   br label %if.endif
 
 if.false:
-  %f1 = mul i32 %e1, %e1
-  %f3 = sext i32 %f1 to i128
+  %f1 = mul i64 %e1, %e1
+  %f3 = sext i64 %f1 to i128
   %f4 = mul i128 %e2, %f3
   %f6 = add i128 %e3, %f4
   %f7 = xor i128 %e4, %f6
   %f8 = xor i128 %e5, %f7
   %a1 = ashr i128 %f8, %e5
-  %f5 = trunc i128 %a1 to i32
+  %f5 = trunc i128 %a1 to i64
   br label %if.endif
 
 if.endif:
-  %r1 = phi i32 [ %val, %if.true ], [ %f1, %if.false ]
-  %r2 = phi i32 [ %val, %if.true ], [ %f5, %if.false ]
-  %res = add i32 %r1, %r2
-  ret i32 %res
+  %r1 = phi i64 [ %val, %if.true ], [ %f1, %if.false ]
+  %r2 = phi i64 [ %val, %if.true ], [ %f5, %if.false ]
+  %res = add i64 %r1, %r2
+  ret i64 %res
 }
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index d2b45ee1e03e63..8c48e6f9da80a7 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -2758,6 +2758,58 @@ entry:
   ret float %result
 }
 
+; Verify that tan(42.0) isn't simplified when the rounding mode is unknown.
+define double @ftan() #0 {
+; X87-LABEL: ftan:
+; X87:       # %bb.0: # %entry
+; X87-NEXT:    subl $12, %esp
+; X87-NEXT:    .cfi_def_cfa_offset 16
+; X87-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
+; X87-NEXT:    fstpl (%esp)
+; X87-NEXT:    wait
+; X87-NEXT:    calll tan
+; X87-NEXT:    addl $12, %esp
+; X87-NEXT:    .cfi_def_cfa_offset 4
+; X87-NEXT:    retl
+;
+; X86-SSE-LABEL: ftan:
+; X86-SSE:       # %bb.0: # %entry
+; X86-SSE-NEXT:    subl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 16
+; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; X86-SSE-NEXT:    movsd %xmm0, (%esp)
+; X86-SSE-NEXT:    calll tan
+; X86-SSE-NEXT:    addl $12, %esp
+; X86-SSE-NEXT:    .cfi_def_cfa_offset 4
+; X86-SSE-NEXT:    retl
+;
+; SSE-LABEL: ftan:
+; SSE:       # %bb.0: # %entry
+; SSE-NEXT:    pushq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 16
+; SSE-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; SSE-NEXT:    callq tan@PLT
+; SSE-NEXT:    popq %rax
+; SSE-NEXT:    .cfi_def_cfa_offset 8
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: ftan:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %result = call double @llvm.experimental.constrained.tan.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict") #0
+  ret double %result
+}
+
+
 attributes #0 = { strictfp }
 
 @llvm.fp.env = thread_local global i8 zeroinitializer, section "llvm.metadata"
@@ -2771,6 +2823,7 @@ declare double @llvm.experimental.constrained.pow.f64(double, double, metadata,
 declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
index 1bc308bef8cccf..cfec52c0e68863 100644
--- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll
@@ -160,6 +160,23 @@ define float @sin(float %x) #0 {
   ret float %result
 }
 
+define float @tan(float %x) #0 {
+; CHECK-LABEL: tan:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    fstpl (%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    calll _tan
+; CHECK-NEXT:    fstps {{[0-9]+}}(%esp)
+; CHECK-NEXT:    flds {{[0-9]+}}(%esp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
+  %result = call float @llvm.experimental.constrained.tan.f32(float %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret float %result
+}
+
 attributes #0 = { strictfp }
 
 declare float @llvm.experimental.constrained.ceil.f32(float, metadata)
@@ -171,3 +188,4 @@ declare float @llvm.experimental.constrained.log.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.log10.f32(float, metadata, metadata)
 declare float @llvm.experimental.constrained.pow.f32(float, float, metadata, metadata)
 declare float @llvm.experimental.constrained.sin.f32(float, metadata, metadata)
+declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
index f1d473f81a9fa1..bd51f553587db7 100644
--- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
+++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll
@@ -1047,6 +1047,46 @@ entry:
   ret fp128 %sqrt
 }
 
+define fp128 @tan(fp128 %x) nounwind strictfp {
+; ANDROID-LABEL: tan:
+; ANDROID:       # %bb.0: # %entry
+; ANDROID-NEXT:    pushq %rax
+; ANDROID-NEXT:    callq tanl@PLT
+; ANDROID-NEXT:    popq %rax
+; ANDROID-NEXT:    retq
+;
+; GNU-LABEL: tan:
+; GNU:       # %bb.0: # %entry
+; GNU-NEXT:    pushq %rax
+; GNU-NEXT:    callq tanf128@PLT
+; GNU-NEXT:    popq %rax
+; GNU-NEXT:    retq
+;
+; X86-LABEL: tan:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    subl $24, %esp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl {{[0-9]+}}(%esp)
+; X86-NEXT:    pushl %eax
+; X86-NEXT:    calll tanl
+; X86-NEXT:    addl $28, %esp
+; X86-NEXT:    movaps (%esp), %xmm0
+; X86-NEXT:    movaps %xmm0, (%esi)
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    addl $24, %esp
+; X86-NEXT:    popl %esi
+; X86-NEXT:    retl $4
+entry:
+  %tan = call fp128 @llvm.experimental.constrained.tan.f128(fp128 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret fp128 %tan
+}
+
 define fp128 @trunc(fp128 %x) nounwind strictfp {
 ; ANDROID-LABEL: trunc:
 ; ANDROID:       # %bb.0: # %entry
@@ -1663,6 +1703,7 @@ declare fp128 @llvm.experimental.constrained.round.f128(fp128, metadata)
 declare fp128 @llvm.experimental.constrained.roundeven.f128(fp128, metadata)
 declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata)
+declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata)
 declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata)
 declare i32 @llvm.experimental.constrained.lrint.i32.f128(fp128, metadata, metadata)
 declare i64 @llvm.experimental.constrained.llrint.i64.f128(fp128, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
index 4d50b15e5c185b..89729975cfd61b 100644
--- a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
+++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll
@@ -504,6 +504,31 @@ entry:
   ret x86_fp80 %sin
 }
 
+define x86_fp80 @tan(x86_fp80 %x) nounwind strictfp {
+; X86-LABEL: tan:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    subl $12, %esp
+; X86-NEXT:    fldt {{[0-9]+}}(%esp)
+; X86-NEXT:    fstpt (%esp)
+; X86-NEXT:    wait
+; X86-NEXT:    calll tanl
+; X86-NEXT:    addl $12, %esp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tan:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    subq $24, %rsp
+; X64-NEXT:    fldt {{[0-9]+}}(%rsp)
+; X64-NEXT:    fstpt (%rsp)
+; X64-NEXT:    wait
+; X64-NEXT:    callq tanl@PLT
+; X64-NEXT:    addq $24, %rsp
+; X64-NEXT:    retq
+entry:
+  %tan = call x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80 %x, metadata !"round.dynamic", metadata !"fpexcept.strict") #0
+  ret x86_fp80 %tan
+}
+
 define x86_fp80 @trunc(x86_fp80 %x) nounwind strictfp {
 ; X86-LABEL: trunc:
 ; X86:       # %bb.0: # %entry
@@ -650,6 +675,7 @@ declare x86_fp80 @llvm.experimental.constrained.rint.f80(x86_fp80, metadata, met
 declare x86_fp80 @llvm.experimental.constrained.round.f80(x86_fp80, metadata)
 declare x86_fp80 @llvm.experimental.constrained.roundeven.f80(x86_fp80, metadata)
 declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata)
+declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata)
 declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata)
 declare i32 @llvm.experimental.constrained.lrint.i32.f80(x86_fp80, metadata, metadata)
 declare i64 @llvm.experimental.constrained.llrint.i64.f80(x86_fp80, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 2046d790cc57e4..999be0f98b6fc5 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1,1336 +1,1414 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefix=CHECK-32
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=CHECK-64
+; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-GENERIC
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+ndd | FileCheck %s -check-prefixes=X64,X64-NDD
 
 define i1 @isnan_f(float %x) {
-; CHECK-32-LABEL: isnan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-64-NEXT:    setp %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    ucomiss %xmm0, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 3)  ; "nan"
   ret i1 %0
 }
 
 define i1 @isnot_nan_f(float %x) {
-; CHECK-32-LABEL: isnot_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setnp %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnot_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-64-NEXT:    setnp %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnot_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setnp %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnot_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    ucomiss %xmm0, %xmm0
+; X64-NEXT:    setnp %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1020)  ; 0x3fc = "zero|subnormal|normal|inf"
   ret i1 %0
 }
 
 define i1 @issignaling_f(float %x) {
-; CHECK-32-LABEL: issignaling_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issignaling_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issignaling_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issignaling_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setl %cl
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1)  ; "snan"
   ret i1 %0
 }
 
 define i1 @not_issignaling_f(float %x) {
-; CHECK-32-LABEL: not_issignaling_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issignaling_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issignaling_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %cl
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setl %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issignaling_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %cl
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setl %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1022)  ; ~"snan"
   ret i1 %0
 }
 
 define i1 @isquiet_f(float %x) {
-; CHECK-32-LABEL: isquiet_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isquiet_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isquiet_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isquiet_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 2)  ; "qnan"
   ret i1 %0
 }
 
 define i1 @not_isquiet_f(float %x) {
-; CHECK-32-LABEL: not_isquiet_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isquiet_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isquiet_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isquiet_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1021)  ; ~"qnan"
   ret i1 %0
 }
 
 define i1 @isinf_f(float %x) {
-; CHECK-32-LABEL: isinf_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isinf_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isinf_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isinf_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516)  ; 0x204 = "inf"
   ret i1 %0
 }
 
 define i1 @not_isinf_f(float %x) {
-; CHECK-32-LABEL: not_isinf_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isinf_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isinf_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isinf_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507)  ; ~0x204 = "~inf"
   ret i1 %0
 }
 
 define i1 @is_plus_inf_f(float %x) {
-; CHECK-32-LABEL: is_plus_inf_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_inf_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_inf_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_inf_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 512)  ; 0x200 = "+inf"
   ret i1 %0
 }
 
 define i1 @is_minus_inf_f(float %x) {
-; CHECK-32-LABEL: is_minus_inf_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_inf_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_inf_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_inf_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 4)  ; "-inf"
   ret i1 %0
 }
 
 define i1 @not_is_minus_inf_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_inf_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_inf_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_inf_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $-8388608, {{[0-9]+}}(%esp) # imm = 0xFF800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_minus_inf_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1019)  ; ~"-inf"
   ret i1 %0
 }
 
 define i1 @isfinite_f(float %x) {
-; CHECK-32-LABEL: isfinite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isfinite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isfinite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isfinite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %0
 }
 
 define i1 @not_isfinite_f(float %x) {
-; CHECK-32-LABEL: not_isfinite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isfinite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isfinite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isfinite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519)  ; ~0x1f8 = "~finite"
   ret i1 %0
 }
 
 define i1 @is_plus_finite_f(float %x) {
-; CHECK-32-LABEL: is_plus_finite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_finite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_finite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_finite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 448)  ; 0x1c0 = "+finite"
   ret i1 %0
 }
 
 define i1 @not_is_plus_finite_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_finite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_finite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_finite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_plus_finite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 575)  ; ~0x1c0 = ~"+finite"
   ret i1 %0
 }
 
 define i1 @is_minus_finite_f(float %x) {
-; CHECK-32-LABEL: is_minus_finite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sets %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_finite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sets %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_finite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sets %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_finite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sets %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 56)  ; 0x38 = "-finite"
   ret i1 %0
 }
 
 define i1 @not_is_minus_finite_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_finite_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    setns %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_finite_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setns %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_finite_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setns %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_minus_finite_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setns %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 967)  ; ~0x38 = ~"-finite"
   ret i1 %0
 }
 
 define i1 @isnormal_f(float %x) #1 {
-; CHECK-32-LABEL: isnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 264)  ; 0x108 = "normal"
   ret i1 %0
 }
 
 define i1 @not_isnormal_f(float %x) #1 {
-; CHECK-32-LABEL: not_isnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 759)  ; ~0x108 = "~normal"
   ret i1 %0
 }
 
 define i1 @is_plus_normal_f(float %x) {
-; CHECK-32-LABEL: is_plus_normal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    setns %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_normal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setns %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_normal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setns %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_normal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setns %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NEXT:    setb %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 256)  ; 0x100 = "+normal"
   ret i1 %0
 }
 
 define i1 @issubnormal_f(float %x) {
-; CHECK-32-LABEL: issubnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %0
 }
 
 define i1 @issubnormal_f_daz(float %x) #0 {
-; CHECK-32-LABEL: issubnormal_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %0
 }
 
 define i1 @issubnormal_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: issubnormal_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879)  ; ~0x90 = "~subnormal"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_f_daz(float %x) #0 {
-; CHECK-32-LABEL: not_issubnormal_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879)  ; ~0x90 = "~subnormal"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: not_issubnormal_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 879)  ; ~0x90 = "~subnormal"
   ret i1 %0
 }
 
 define i1 @is_plus_subnormal_f(float %x) {
-; CHECK-32-LABEL: is_plus_subnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_subnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_subnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_subnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 128)  ; 0x80 = "+subnormal"
   ret i1 %0
 }
 
 define i1 @not_is_plus_subnormal_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_subnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_subnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_subnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setae %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_plus_subnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setae %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 895)  ; ~0x80 = ~"+subnormal"
   ret i1 %0
 }
 
 define i1 @is_minus_subnormal_f(float %x) {
-; CHECK-32-LABEL: is_minus_subnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sets %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_subnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sets %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_subnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sets %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_subnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sets %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 16)  ; 0x10 = "-subnormal"
   ret i1 %0
 }
 
 define i1 @not_is_minus_subnormal_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_subnormal_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    setns %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setae %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_subnormal_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setns %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setae %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_subnormal_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setns %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setae %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_minus_subnormal_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setns %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setae %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1007)  ; ~0x10 = ~"-subnormal"
   ret i1 %0
 }
 
 define i1 @iszero_f(float %x) {
-; CHECK-32-LABEL: iszero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96)  ; 0x60 = "zero"
   ret i1 %0
 }
 
 define i1 @iszero_f_daz(float %x) #0 {
-; CHECK-32-LABEL: iszero_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96)  ; 0x60 = "zero"
   ret i1 %0
 }
 
 define i1 @iszero_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: iszero_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96)  ; 0x60 = "zero"
   ret i1 %0
 }
 
 define i1 @not_iszero_f(float %x) {
-; CHECK-32-LABEL: not_iszero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927)  ; ~0x60 = "~zero"
   ret i1 %0
 }
 
 define i1 @not_iszero_f_daz(float %x) #0 {
-; CHECK-32-LABEL: not_iszero_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927)  ; ~0x60 = "~zero"
   ret i1 %0
 }
 
 define i1 @not_iszero_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: not_iszero_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927)  ; ~0x60 = "~zero"
   ret i1 %0
 }
 
 define i1 @issubnormal_or_zero_f(float %x) {
-; CHECK-32-LABEL: issubnormal_or_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240)  ; 0xf0 = "subnormal|zero"
   ret i1 %0
 }
 
 define i1 @issubnormal_or_zero_f_daz(float %x) #0 {
-; CHECK-32-LABEL: issubnormal_or_zero_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240)  ; 0xf0 = "subnormal|zero"
   ret i1 %0
 }
 
 define i1 @issubnormal_or_zero_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: issubnormal_or_zero_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 240)  ; 0xf0 = "subnormal|zero"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_or_zero_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_or_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_or_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_or_zero_f_daz(float %x) #0 {
-; CHECK-32-LABEL: not_issubnormal_or_zero_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_or_zero_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
   ret i1 %0
 }
 
 define i1 @not_issubnormal_or_zero_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: not_issubnormal_or_zero_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2139095040, {{[0-9]+}}(%esp) # imm = 0x7F800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_or_zero_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 783)  ; ~0xf0 = "~(subnormal|zero)"
   ret i1 %0
 }
 
 define i1 @is_plus_zero_f(float %x) {
-; CHECK-32-LABEL: is_plus_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 64)  ; 0x40 = "+zero"
   ret i1 %0
 }
 
 define i1 @not_is_plus_zero_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_plus_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 959)  ; ~0x40 = ~"+zero"
   ret i1 %0
 }
 
 define i1 @is_minus_zero_f(float %x) {
-; CHECK-32-LABEL: is_minus_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 32)  ; 0x20 = "-zero"
   ret i1 %0
 }
 
 define i1 @not_is_minus_zero_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_zero_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_zero_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_zero_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    cmpl $-2147483648, {{[0-9]+}}(%esp) # imm = 0x80000000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_minus_zero_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-2147483648, %eax # imm = 0x80000000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 991)  ; ~0x20 = ~"-zero"
   ret i1 %0
 }
 
 define i1 @isnan_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: isnan_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 3) strictfp ; "nan"
   ret i1 %0
 }
 
 define i1 @not_isnan_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: not_isnan_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isnan_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isnan_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isnan_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1020) strictfp ; ~"nan"
   ret i1 %0
 }
 
 define i1 @isfinite_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: isfinite_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isfinite_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isfinite_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isfinite_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504) strictfp ; 0x1f8 = "finite"
   ret i1 %0
 }
 
 define i1 @not_isfinite_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: not_isfinite_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isfinite_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isfinite_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isfinite_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519) strictfp ; ~0x1f8 = ~"finite"
   ret i1 %0
 }
 
 define i1 @iszero_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: iszero_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 96) strictfp ; 0x60 = "zero"
   ret i1 %0
 }
 
 define i1 @not_iszero_f_strictfp(float %x) strictfp {
-; CHECK-32-LABEL: not_iszero_f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    testl $2147483647, {{[0-9]+}}(%esp) # imm = 0x7FFFFFFF
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 927) strictfp ; ~0x60 = ~"zero"
   ret i1 %0
 }
 
 define i1 @isnan_d(double %x) {
-; CHECK-32-LABEL: isnan_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    fldl {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    ucomisd %xmm0, %xmm0
-; CHECK-64-NEXT:    setp %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    fldl {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_d:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    ucomisd %xmm0, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 3)  ; "nan"
   ret i1 %0
 }
 
 define i1 @isinf_d(double %x) {
-; CHECK-32-LABEL: isinf_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    xorl $2146435072, %eax # imm = 0x7FF00000
-; CHECK-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isinf_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isinf_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl $2146435072, %eax # imm = 0x7FF00000
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: isinf_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: isinf_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 516)  ; 0x204 = "inf"
   ret i1 %0
 }
 
 define i1 @isfinite_d(double %x) {
-; CHECK-32-LABEL: isfinite_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2146435072, %eax # imm = 0x7FF00000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isfinite_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isfinite_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2146435072, %eax # imm = 0x7FF00000
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: isfinite_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: isfinite_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    setl %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 504)  ; 0x1f8 = "finite"
   ret i1 %0
 }
 
 define i1 @isnormal_d(double %x) {
-; CHECK-32-LABEL: isnormal_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    addl $-1048576, %eax # imm = 0xFFF00000
-; CHECK-32-NEXT:    shrl $21, %eax
-; CHECK-32-NEXT:    cmpl $1023, %eax # imm = 0x3FF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnormal_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $-4503599627370496, %rax # imm = 0xFFF0000000000000
-; CHECK-64-NEXT:    addq %rcx, %rax
-; CHECK-64-NEXT:    shrq $53, %rax
-; CHECK-64-NEXT:    cmpl $1023, %eax # imm = 0x3FF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnormal_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl $-1048576, %eax # imm = 0xFFF00000
+; X86-NEXT:    shrl $21, %eax
+; X86-NEXT:    cmpl $1023, %eax # imm = 0x3FF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: isnormal_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $-4503599627370496, %rax # imm = 0xFFF0000000000000
+; X64-GENERIC-NEXT:    addq %rcx, %rax
+; X64-GENERIC-NEXT:    shrq $53, %rax
+; X64-GENERIC-NEXT:    cmpl $1023, %eax # imm = 0x3FF
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: isnormal_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $-4503599627370496, %rcx # imm = 0xFFF0000000000000
+; X64-NDD-NEXT:    addq %rcx, %rax
+; X64-NDD-NEXT:    shrq $53, %rax
+; X64-NDD-NEXT:    cmpl $1023, %eax # imm = 0x3FF
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 264)  ; 0x108 = "normal"
   ret i1 %0
 }
 
 define i1 @issubnormal_d(double %x) {
-; CHECK-32-LABEL: issubnormal_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    addl $-1, %eax
-; CHECK-32-NEXT:    adcl $-1, %ecx
-; CHECK-32-NEXT:    cmpl $-1, %eax
-; CHECK-32-NEXT:    sbbl $1048575, %ecx # imm = 0xFFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    decq %rcx
-; CHECK-64-NEXT:    movabsq $4503599627370495, %rax # imm = 0xFFFFFFFFFFFFF
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl $-1, %eax
+; X86-NEXT:    adcl $-1, %ecx
+; X86-NEXT:    cmpl $-1, %eax
+; X86-NEXT:    sbbl $1048575, %ecx # imm = 0xFFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: issubnormal_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    decq %rcx
+; X64-GENERIC-NEXT:    movabsq $4503599627370495, %rax # imm = 0xFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: issubnormal_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    decq %rax
+; X64-NDD-NEXT:    movabsq $4503599627370495, %rcx # imm = 0xFFFFFFFFFFFFF
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 144)  ; 0x90 = "subnormal"
   ret i1 %0
 }
 
 define i1 @iszero_d(double %x) {
-; CHECK-32-LABEL: iszero_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    shlq %rax
-; CHECK-64-NEXT:    testq %rax, %rax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_d:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    shlq %rax
+; X64-NEXT:    testq %rax, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 96)  ; 0x60 = "zero"
   ret i1 %0
 }
 
 define i1 @issignaling_d(double %x) {
-; CHECK-32-LABEL: issignaling_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    xorl %ecx, %ecx
-; CHECK-32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
-; CHECK-32-NEXT:    sbbl %eax, %ecx
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    cmpl $2146959360, %eax # imm = 0x7FF80000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issignaling_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setg %al
-; CHECK-64-NEXT:    andb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issignaling_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl $2146959360, %eax # imm = 0x7FF80000
+; X86-NEXT:    setl %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: issignaling_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $9221120237041090560, %rax # imm = 0x7FF8000000000000
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setg %al
+; X64-GENERIC-NEXT:    andb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: issignaling_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $9221120237041090560, %rcx # imm = 0x7FF8000000000000
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    movabsq $9218868437227405312, %rdx # imm = 0x7FF0000000000000
+; X64-NDD-NEXT:    cmpq %rdx, %rax
+; X64-NDD-NEXT:    setg %al
+; X64-NDD-NEXT:    andb %cl, %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 1)  ; "snan"
   ret i1 %0
 }
 
 define i1 @isquiet_d(double %x) {
-; CHECK-32-LABEL: isquiet_d:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2146959360, %eax # imm = 0x7FF80000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isquiet_d:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $9221120237041090559, %rax # imm = 0x7FF7FFFFFFFFFFFF
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setg %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isquiet_d:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2146959360, %eax # imm = 0x7FF80000
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: isquiet_d:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $9221120237041090559, %rax # imm = 0x7FF7FFFFFFFFFFFF
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setg %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: isquiet_d:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $9221120237041090559, %rcx # imm = 0x7FF7FFFFFFFFFFFF
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    setg %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 2)  ; "qnan"
   ret i1 %0
 }
 
 define i1 @isnan_d_strictfp(double %x) strictfp {
-; CHECK-32-LABEL: isnan_d_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    xorl %ecx, %ecx
-; CHECK-32-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
-; CHECK-32-NEXT:    sbbl %eax, %ecx
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_d_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
-; CHECK-64-NEXT:    andq %rax, %rcx
-; CHECK-64-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
-; CHECK-64-NEXT:    cmpq %rax, %rcx
-; CHECK-64-NEXT:    setg %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_d_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $2146435072, %ecx # imm = 0x7FF00000
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: isnan_d_strictfp:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movq %xmm0, %rax
+; X64-GENERIC-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-GENERIC-NEXT:    andq %rax, %rcx
+; X64-GENERIC-NEXT:    movabsq $9218868437227405312, %rax # imm = 0x7FF0000000000000
+; X64-GENERIC-NEXT:    cmpq %rax, %rcx
+; X64-GENERIC-NEXT:    setg %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: isnan_d_strictfp:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movq %xmm0, %rax
+; X64-NDD-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; X64-NDD-NEXT:    andq %rcx, %rax
+; X64-NDD-NEXT:    movabsq $9218868437227405312, %rcx # imm = 0x7FF0000000000000
+; X64-NDD-NEXT:    cmpq %rcx, %rax
+; X64-NDD-NEXT:    setg %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 3) strictfp ; "nan"
   ret i1 %0
 }
 
 define i1 @iszero_d_strictfp(double %x) strictfp {
-; CHECK-32-LABEL: iszero_d_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_d_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movq %xmm0, %rax
-; CHECK-64-NEXT:    shlq %rax
-; CHECK-64-NEXT:    testq %rax, %rax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_d_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_d_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movq %xmm0, %rax
+; X64-NEXT:    shlq %rax
+; X64-NEXT:    testq %rax, %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f64(double %x, i32 96) strictfp ; 0x60 = "zero"
   ret i1 %0
@@ -1339,70 +1417,70 @@ entry:
 
 
 define <1 x i1> @isnan_v1f(<1 x float> %x) {
-; CHECK-32-LABEL: isnan_v1f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_v1f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    ucomiss %xmm0, %xmm0
-; CHECK-64-NEXT:    setp %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v1f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_v1f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    ucomiss %xmm0, %xmm0
+; X64-NEXT:    setp %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call <1 x i1> @llvm.is.fpclass.v1f32(<1 x float> %x, i32 3)  ; "nan"
   ret <1 x i1> %0
 }
 
 define <1 x i1> @isnan_v1f_strictfp(<1 x float> %x) strictfp {
-; CHECK-32-LABEL: isnan_v1f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_v1f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v1f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_v1f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call <1 x i1> @llvm.is.fpclass.v1f32(<1 x float> %x, i32 3) strictfp ; "nan"
   ret <1 x i1> %0
 }
 
 define <2 x i1> @isnan_v2f(<2 x float> %x) {
-; CHECK-32-LABEL: isnan_v2f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %cl
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %dl
-; CHECK-32-NEXT:    movl %ecx, %eax
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_v2f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    cmpunordps %xmm0, %xmm0
-; CHECK-64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v2f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %cl
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %dl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_v2f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cmpunordps %xmm0, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT:    retq
 entry:
   %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 3)  ; "nan"
   ret <2 x i1> %0
@@ -1410,173 +1488,173 @@ entry:
 
 
 define <2 x i1> @isnot_nan_v2f(<2 x float> %x) {
-; CHECK-32-LABEL: isnot_nan_v2f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setnp %cl
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setnp %dl
-; CHECK-32-NEXT:    movl %ecx, %eax
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnot_nan_v2f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    cmpordps %xmm0, %xmm0
-; CHECK-64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnot_nan_v2f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setnp %cl
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setnp %dl
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnot_nan_v2f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cmpordps %xmm0, %xmm0
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT:    retq
 entry:
   %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 1020)  ; 0x3fc = "zero|subnormal|normal|inf"
   ret <2 x i1> %0
 }
 
 define <2 x i1> @isnan_v2f_strictfp(<2 x float> %x) strictfp {
-; CHECK-32-LABEL: isnan_v2f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    andl %ecx, %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isnan_v2f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; CHECK-64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-64-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v2f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    andl %ecx, %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    retl
+;
+; X64-LABEL: isnan_v2f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
+; X64-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> %x, i32 3) strictfp ; "nan"
   ret <2 x i1> %0
 }
 
 define <4 x i1> @isnan_v4f(<4 x float> %x) {
-; CHECK-32-LABEL: isnan_v4f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %dh
-; CHECK-32-NEXT:    shlb $2, %dh
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %dl
-; CHECK-32-NEXT:    shlb $3, %dl
-; CHECK-32-NEXT:    orb %dh, %dl
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %dh
-; CHECK-32-NEXT:    fucomp %st(0)
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setp %al
-; CHECK-32-NEXT:    addb %al, %al
-; CHECK-32-NEXT:    orb %dh, %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    movb %al, (%ecx)
-; CHECK-32-NEXT:    movl %ecx, %eax
-; CHECK-32-NEXT:    retl $4
-;
-; CHECK-64-LABEL: isnan_v4f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    cmpunordps %xmm0, %xmm0
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v4f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %dh
+; X86-NEXT:    shlb $2, %dh
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %dl
+; X86-NEXT:    shlb $3, %dl
+; X86-NEXT:    orb %dh, %dl
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %dh
+; X86-NEXT:    fucomp %st(0)
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setp %al
+; X86-NEXT:    addb %al, %al
+; X86-NEXT:    orb %dh, %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    movb %al, (%ecx)
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: isnan_v4f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    cmpunordps %xmm0, %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %x, i32 3)  ; "nan"
   ret <4 x i1> %0
 }
 
 define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
-; CHECK-32-LABEL: isnan_v4f_strictfp:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    pushl %esi
-; CHECK-32-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-32-NEXT:    .cfi_offset %esi, -8
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; CHECK-32-NEXT:    andl %ecx, %edx
-; CHECK-32-NEXT:    cmpl $2139095041, %edx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dh
-; CHECK-32-NEXT:    shlb $2, %dh
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-32-NEXT:    andl %ecx, %esi
-; CHECK-32-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    shlb $3, %dl
-; CHECK-32-NEXT:    orb %dh, %dl
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; CHECK-32-NEXT:    andl %ecx, %esi
-; CHECK-32-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dh
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %ecx
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    addb %cl, %cl
-; CHECK-32-NEXT:    orb %dh, %cl
-; CHECK-32-NEXT:    orb %dl, %cl
-; CHECK-32-NEXT:    movb %cl, (%eax)
-; CHECK-32-NEXT:    popl %esi
-; CHECK-32-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-32-NEXT:    retl $4
-;
-; CHECK-64-LABEL: isnan_v4f_strictfp:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-64-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isnan_v4f_strictfp:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    andl %ecx, %edx
+; X86-NEXT:    cmpl $2139095041, %edx # imm = 0x7F800001
+; X86-NEXT:    setge %dh
+; X86-NEXT:    shlb $2, %dh
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    shlb $3, %dl
+; X86-NEXT:    orb %dh, %dl
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    andl %ecx, %esi
+; X86-NEXT:    cmpl $2139095041, %esi # imm = 0x7F800001
+; X86-NEXT:    setge %dh
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    orb %dh, %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    movb %cl, (%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl $4
+;
+; X64-LABEL: isnan_v4f_strictfp:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-NEXT:    retq
 entry:
   %0 = tail call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %x, i32 3) strictfp ; "nan"
   ret <4 x i1> %0
 }
 
 define i1 @isnone_f(float %x) {
-; CHECK-32-LABEL: isnone_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    xorl %eax, %eax
-; CHECK-32-NEXT:    retl
+; X86-LABEL: isnone_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:    retl
 ;
-; CHECK-64-LABEL: isnone_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xorl %eax, %eax
-; CHECK-64-NEXT:    retq
+; X64-LABEL: isnone_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 0)
   ret i1 %0
 }
 
 define i1 @isany_f(float %x) {
-; CHECK-32-LABEL: isany_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movb $1, %al
-; CHECK-32-NEXT:    retl
+; X86-LABEL: isany_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
 ;
-; CHECK-64-LABEL: isany_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movb $1, %al
-; CHECK-64-NEXT:    retq
+; X64-LABEL: isany_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1023)
   ret i1 %0
@@ -1584,1056 +1662,1263 @@ entry:
 
 
 define i1 @iszero_or_nan_f(float %x) {
-; CHECK-32-LABEL: iszero_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fldz
-; CHECK-32-NEXT:    fucompp
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xorps %xmm1, %xmm1
-; CHECK-64-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fldz
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99)  ; 0x60|0x3 = "zero|nan"
   ret i1 %0
 }
 
 define i1 @iszero_or_nan_f_daz(float %x) #0 {
-; CHECK-32-LABEL: iszero_or_nan_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_or_nan_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_or_nan_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_or_nan_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %cl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99)  ; 0x60|0x3 = "zero|nan"
   ret i1 %0
 }
 
 define i1 @iszero_or_nan_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: iszero_or_nan_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_or_nan_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_or_nan_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_or_nan_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %cl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 99)  ; 0x60|0x3 = "zero|nan"
   ret i1 %0
 }
 
 define i1 @not_iszero_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_iszero_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fldz
-; CHECK-32-NEXT:    fucompp
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    xorps %xmm1, %xmm1
-; CHECK-64-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fldz
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924)  ; ~0x60 = "~(zero|nan)"
   ret i1 %0
 }
 
 define i1 @not_iszero_or_nan_f_daz(float %x) #0 {
-; CHECK-32-LABEL: not_iszero_or_nan_f_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_or_nan_f_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_or_nan_f_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setl %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_or_nan_f_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setl %cl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
   ret i1 %0
 }
 
 define i1 @not_iszero_or_nan_f_maybe_daz(float %x) #1 {
-; CHECK-32-LABEL: not_iszero_or_nan_f_maybe_daz:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_or_nan_f_maybe_daz:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_or_nan_f_maybe_daz:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setl %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_iszero_or_nan_f_maybe_daz:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setl %cl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 924)  ; ~(0x60|0x3) = "~(zero|nan)"
   ret i1 %0
 }
 
 define i1 @iszero_or_qnan_f(float %x) {
-; CHECK-32-LABEL: iszero_or_qnan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_or_qnan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_or_qnan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: iszero_or_qnan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %cl
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 98)  ; 0x60|0x2 = "zero|qnan"
   ret i1 %0
 }
 
 define i1 @iszero_or_snan_f(float %x) {
-; CHECK-32-LABEL: iszero_or_snan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    andb %cl, %dl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: iszero_or_snan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %dl
-; CHECK-64-NEXT:    andb %cl, %dl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: iszero_or_snan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: iszero_or_snan_f:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %cl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %dl
+; X64-GENERIC-NEXT:    andb %cl, %dl
+; X64-GENERIC-NEXT:    testl %eax, %eax
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: iszero_or_snan_f:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %dl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    testl %eax, %eax
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 97)  ; 0x60|0x1 = "zero|snan"
   ret i1 %0
 }
 
 define i1 @not_iszero_or_qnan_f(float %x) {
-; CHECK-32-LABEL: not_iszero_or_qnan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    pushl %esi
-; CHECK-32-NEXT:    .cfi_def_cfa_offset 8
-; CHECK-32-NEXT:    .cfi_offset %esi, -8
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    andb %cl, %dl
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    leal -1(%eax), %esi
-; CHECK-32-NEXT:    cmpl $8388607, %esi # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %ch
-; CHECK-32-NEXT:    orb %cl, %ch
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    orb %ch, %al
-; CHECK-32-NEXT:    popl %esi
-; CHECK-32-NEXT:    .cfi_def_cfa_offset 4
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_or_qnan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %dl
-; CHECK-64-NEXT:    andb %cl, %dl
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    leal -1(%rax), %esi
-; CHECK-64-NEXT:    cmpl $8388607, %esi # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %sil
-; CHECK-64-NEXT:    orb %cl, %sil
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    orb %sil, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_or_qnan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    leal -1(%eax), %esi
+; X86-NEXT:    cmpl $8388607, %esi # imm = 0x7FFFFF
+; X86-NEXT:    setb %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    orb %ch, %al
+; X86-NEXT:    popl %esi
+; X86-NEXT:    .cfi_def_cfa_offset 4
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_iszero_or_qnan_f:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %cl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %dl
+; X64-GENERIC-NEXT:    andb %cl, %dl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    leal -1(%rax), %esi
+; X64-GENERIC-NEXT:    cmpl $8388607, %esi # imm = 0x7FFFFF
+; X64-GENERIC-NEXT:    setb %sil
+; X64-GENERIC-NEXT:    orb %cl, %sil
+; X64-GENERIC-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    orb %sil, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_iszero_or_qnan_f:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %dl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %dl
+; X64-NDD-NEXT:    decl %eax, %esi
+; X64-NDD-NEXT:    cmpl $8388607, %esi # imm = 0x7FFFFF
+; X64-NDD-NEXT:    setb %sil
+; X64-NDD-NEXT:    orb %sil, %dl
+; X64-NDD-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    orb %dl, %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 925)  ; ~(0x60|0x2) = "~(zero|qnan)"
   ret i1 %0
 }
 
 define i1 @not_iszero_or_snan_f(float %x) {
-; CHECK-32-LABEL: not_iszero_or_snan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    leal -1(%eax), %edx
-; CHECK-32-NEXT:    cmpl $8388607, %edx # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %dl
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %ch
-; CHECK-32-NEXT:    orb %cl, %ch
-; CHECK-32-NEXT:    orb %dl, %ch
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %ch, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_iszero_or_snan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    leal -1(%rax), %edx
-; CHECK-64-NEXT:    cmpl $8388607, %edx # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %dl
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %sil
-; CHECK-64-NEXT:    orb %cl, %sil
-; CHECK-64-NEXT:    orb %dl, %sil
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %sil, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_iszero_or_snan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    leal -1(%eax), %edx
+; X86-NEXT:    cmpl $8388607, %edx # imm = 0x7FFFFF
+; X86-NEXT:    setb %dl
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %ch
+; X86-NEXT:    orb %cl, %ch
+; X86-NEXT:    orb %dl, %ch
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %ch, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_iszero_or_snan_f:
+; X64-GENERIC:       # %bb.0: # %entry
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    leal -1(%rax), %edx
+; X64-GENERIC-NEXT:    cmpl $8388607, %edx # imm = 0x7FFFFF
+; X64-GENERIC-NEXT:    setb %dl
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setge %sil
+; X64-GENERIC-NEXT:    orb %cl, %sil
+; X64-GENERIC-NEXT:    orb %dl, %sil
+; X64-GENERIC-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    orb %sil, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_iszero_or_snan_f:
+; X64-NDD:       # %bb.0: # %entry
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %cl
+; X64-NDD-NEXT:    decl %eax, %edx
+; X64-NDD-NEXT:    cmpl $8388607, %edx # imm = 0x7FFFFF
+; X64-NDD-NEXT:    setb %dl
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setge %sil
+; X64-NDD-NEXT:    orb %sil, %cl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 926)  ; ~(0x60|0x1) = "~(zero|snan)"
   ret i1 %0
 }
 
 define i1 @isinf_or_nan_f(float %x) {
-; CHECK-32-LABEL: isinf_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isinf_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isinf_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setge %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isinf_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setge %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 519)  ; 0x204|0x3 = "inf|nan"
   ret i1 %0
 }
 
 define i1 @not_isinf_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_isinf_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isinf_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isinf_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isinf_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 504)  ; ~(0x204|0x3) = "~(inf|nan)"
   ret i1 %0
 }
 
 define i1 @isfinite_or_nan_f(float %x) {
-; CHECK-32-LABEL: isfinite_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: isfinite_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: isfinite_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: isfinite_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 507)  ; 0x1f8|0x3 = "finite|nan"
   ret i1 %0
 }
 
 define i1 @not_isfinite_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_isfinite_or_nan_f:
-; CHECK-32:       # %bb.0: # %entry
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_isfinite_or_nan_f:
-; CHECK-64:       # %bb.0: # %entry
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_isfinite_or_nan_f:
+; X86:       # %bb.0: # %entry
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_isfinite_or_nan_f:
+; X64:       # %bb.0: # %entry
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
 entry:
   %0 = tail call i1 @llvm.is.fpclass.f32(float %x, i32 516)  ; ~(0x1f8|0x3) = "~(finite|nan)"
   ret i1 %0
 }
 
 define i1 @is_plus_inf_or_nan_f(float %x) {
-; CHECK-32-LABEL: is_plus_inf_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_inf_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_inf_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_inf_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 515)  ; 0x200|0x3 = "+inf|nan"
   ret i1 %class
 }
 
 define i1 @is_minus_inf_or_nan_f(float %x) {
-; CHECK-32-LABEL: is_minus_inf_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_inf_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_inf_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_inf_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 7)  ; "-inf|nan"
   ret i1 %class
 }
 
 define i1 @not_is_plus_inf_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_inf_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_inf_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_inf_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_plus_inf_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 508)  ; ~(0x200|0x3) = "~(+inf|nan)"
   ret i1 %class
 }
 
 define i1 @not_is_minus_inf_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_inf_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_inf_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_inf_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_is_minus_inf_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setl %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1016)  ; "~(-inf|nan)"
   ret i1 %class
 }
 
 define i1 @is_plus_inf_or_snan_f(float %x) {
-; CHECK-32-LABEL: is_plus_inf_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %eax, %ecx
-; CHECK-32-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    andb %dl, %cl
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_inf_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    movl %eax, %ecx
-; CHECK-64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    andb %dl, %cl
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_inf_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: is_plus_inf_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    movl %eax, %ecx
+; X64-GENERIC-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %cl
+; X64-GENERIC-NEXT:    andb %dl, %cl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    orb %cl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: is_plus_inf_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %cl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 513)  ; 0x200|0x1 = "+inf|snan"
   ret i1 %class
 }
 
 define i1 @is_plus_inf_or_qnan_f(float %x) {
-; CHECK-32-LABEL: is_plus_inf_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_plus_inf_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_plus_inf_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_plus_inf_or_qnan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 514)  ; 0x200|0x1 = "+inf|qnan"
   ret i1 %class
 }
 
 define i1 @not_is_plus_inf_or_snan_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_inf_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    orb %cl, %dl
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_inf_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    orb %cl, %dl
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_inf_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_is_plus_inf_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    orb %cl, %dl
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setge %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_is_plus_inf_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    sete %cl
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setge %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 510) ; ~(+inf|snan)
   ret i1 %class
 }
 
 define i1 @not_is_plus_inf_or_qnan_f(float %x) {
-; CHECK-32-LABEL: not_is_plus_inf_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %eax, %ecx
-; CHECK-32-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dh
-; CHECK-32-NEXT:    andb %dl, %dh
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %dl
-; CHECK-32-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    orb %dh, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_plus_inf_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    movl %eax, %ecx
-; CHECK-64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %sil
-; CHECK-64-NEXT:    andb %dl, %sil
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %dl
-; CHECK-64-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    orb %sil, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_plus_inf_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %dh
+; X86-NEXT:    andb %dl, %dh
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %dl
+; X86-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    orb %dh, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_is_plus_inf_or_qnan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    movl %eax, %ecx
+; X64-GENERIC-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %sil
+; X64-GENERIC-NEXT:    andb %dl, %sil
+; X64-GENERIC-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    sete %dl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    orb %sil, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_is_plus_inf_or_qnan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %sil
+; X64-NDD-NEXT:    andb %sil, %dl
+; X64-NDD-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    orb %dl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 509) ; ~(+inf|qnan)
   ret i1 %class
 }
 
 define i1 @is_minus_inf_or_snan_f(float %x) {
-; CHECK-32-LABEL: is_minus_inf_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %eax, %ecx
-; CHECK-32-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    andb %dl, %cl
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_inf_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    movl %eax, %ecx
-; CHECK-64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    andb %dl, %cl
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_inf_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: is_minus_inf_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    movl %eax, %ecx
+; X64-GENERIC-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %cl
+; X64-GENERIC-NEXT:    andb %dl, %cl
+; X64-GENERIC-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    orb %cl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: is_minus_inf_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %cl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 5)  ; "-inf|snan"
   ret i1 %class
 }
 
 define i1 @is_minus_inf_or_qnan_f(float %x) {
-; CHECK-32-LABEL: is_minus_inf_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: is_minus_inf_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: is_minus_inf_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: is_minus_inf_or_qnan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    cmpl $-8388608, %eax # imm = 0xFF800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 6)  ; "-inf|qnan"
   ret i1 %class
 }
 
 define i1 @not_is_minus_inf_or_snan_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_inf_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    orb %cl, %dl
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_inf_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    orb %cl, %dl
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_inf_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_is_minus_inf_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    orb %cl, %dl
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setge %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_is_minus_inf_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %cl
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setge %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1018)  ; "~(-inf|snan)"
   ret i1 %class
 }
 
 define i1 @not_is_minus_inf_or_qnan_f(float %x) {
-; CHECK-32-LABEL: not_is_minus_inf_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %eax, %ecx
-; CHECK-32-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dh
-; CHECK-32-NEXT:    andb %dl, %dh
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %dl
-; CHECK-32-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    orb %dh, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_is_minus_inf_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    movl %eax, %ecx
-; CHECK-64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %sil
-; CHECK-64-NEXT:    andb %dl, %sil
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %dl
-; CHECK-64-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    orb %sil, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_is_minus_inf_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %dh
+; X86-NEXT:    andb %dl, %dh
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %dl
+; X86-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X86-NEXT:    setl %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    orb %dh, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_is_minus_inf_or_qnan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    movl %eax, %ecx
+; X64-GENERIC-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %sil
+; X64-GENERIC-NEXT:    andb %dl, %sil
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %dl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X64-GENERIC-NEXT:    setl %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    orb %sil, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_is_minus_inf_or_qnan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %sil
+; X64-NDD-NEXT:    andb %sil, %dl
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    cmpl $2139095040, %ecx # imm = 0x7F800000
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    orb %dl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 1017)  ; "-inf|qnan"
   ret i1 %class
 }
 
 define i1 @issubnormal_or_nan_f(float %x) {
-; CHECK-32-LABEL: issubnormal_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    decl %eax
-; CHECK-32-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    decl %eax
-; CHECK-64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    decl %eax
+; X86-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %cl
+; X64-NEXT:    decl %eax
+; X64-NEXT:    cmpl $8388607, %eax # imm = 0x7FFFFF
+; X64-NEXT:    setb %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 147)  ; 0x90|0x3 = "subnormal|nan"
   ret i1 %class
 }
 
 define i1 @issubnormal_or_zero_or_nan_f(float %x) {
-; CHECK-32-LABEL: issubnormal_or_zero_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
   ret i1 %class
 }
 
 define i1 @issubnormal_or_zero_or_nan_f_daz(float %x) #0 {
-; CHECK-32-LABEL: issubnormal_or_zero_or_nan_f_daz:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fldz
-; CHECK-32-NEXT:    fucompp
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_or_nan_f_daz:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    xorps %xmm1, %xmm1
-; CHECK-64-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_or_nan_f_daz:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fldz
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    sete %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_or_nan_f_daz:
+; X64:       # %bb.0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 243)  ; 0xf0|0x3 = "subnormal|zero|nan"
   ret i1 %class
 }
 
 define i1 @issubnormal_or_zero_or_snan_f(float %x) {
-; CHECK-32-LABEL: issubnormal_or_zero_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    movl %eax, %ecx
-; CHECK-32-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %dl
-; CHECK-32-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    andb %dl, %cl
-; CHECK-32-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    movl %eax, %ecx
-; CHECK-64-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %dl
-; CHECK-64-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    andb %dl, %cl
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X86-NEXT:    setl %dl
+; X86-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X86-NEXT:    setge %cl
+; X86-NEXT:    andb %dl, %cl
+; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: issubnormal_or_zero_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    movl %eax, %ecx
+; X64-GENERIC-NEXT:    andl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %dl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %cl
+; X64-GENERIC-NEXT:    andb %dl, %cl
+; X64-GENERIC-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %al
+; X64-GENERIC-NEXT:    orb %cl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: issubnormal_or_zero_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax, %ecx # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %ecx # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %dl
+; X64-NDD-NEXT:    cmpl $2139095041, %ecx # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %cl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 241)  ; 0x90|0x1 = "subnormal|snan"
   ret i1 %class
 }
 
 define i1 @issubnormal_or_zero_or_qnan_f(float %x) {
-; CHECK-32-LABEL: issubnormal_or_zero_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: issubnormal_or_zero_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: issubnormal_or_zero_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: issubnormal_or_zero_or_qnan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    sete %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NEXT:    setge %al
+; X64-NEXT:    orb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 242)  ; 0x90|0x2 = "subnormal|qnan"
   ret i1 %class
 }
 
 define i1 @not_issubnormal_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    testl %eax, %eax
-; CHECK-32-NEXT:    sete %dl
-; CHECK-32-NEXT:    orb %cl, %dl
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    testl %eax, %eax
-; CHECK-64-NEXT:    sete %dl
-; CHECK-64-NEXT:    orb %cl, %dl
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    sete %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_issubnormal_or_nan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    testl %eax, %eax
+; X64-GENERIC-NEXT:    sete %dl
+; X64-GENERIC-NEXT:    orb %cl, %dl
+; X64-GENERIC-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_issubnormal_or_nan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %cl
+; X64-NDD-NEXT:    testl %eax, %eax
+; X64-NDD-NEXT:    sete %dl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 876)  ; ~(0x90|0x3) = ~"subnormal|nan"
   ret i1 %class
 }
 
 define i1 @not_issubnormal_or_zero_or_nan_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_or_zero_or_nan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    setne %cl
-; CHECK-32-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setl %al
-; CHECK-32-NEXT:    andb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_or_nan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    setne %cl
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setl %al
-; CHECK-64-NEXT:    andb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_or_nan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    setne %cl
+; X86-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setl %al
+; X86-NEXT:    andb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_or_zero_or_nan_f:
+; X64:       # %bb.0:
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    testl $2139095040, %eax # imm = 0x7F800000
+; X64-NEXT:    setne %cl
+; X64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NEXT:    setl %al
+; X64-NEXT:    andb %cl, %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
   ret i1 %class
 }
 
 define i1 @not_issubnormal_or_zero_or_nan_f_daz(float %x) #0 {
-; CHECK-32-LABEL: not_issubnormal_or_zero_or_nan_f_daz:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    flds {{[0-9]+}}(%esp)
-; CHECK-32-NEXT:    fldz
-; CHECK-32-NEXT:    fucompp
-; CHECK-32-NEXT:    fnstsw %ax
-; CHECK-32-NEXT:    # kill: def $ah killed $ah killed $ax
-; CHECK-32-NEXT:    sahf
-; CHECK-32-NEXT:    setne %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_or_nan_f_daz:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    xorps %xmm1, %xmm1
-; CHECK-64-NEXT:    ucomiss %xmm1, %xmm0
-; CHECK-64-NEXT:    setne %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_or_nan_f_daz:
+; X86:       # %bb.0:
+; X86-NEXT:    flds {{[0-9]+}}(%esp)
+; X86-NEXT:    fldz
+; X86-NEXT:    fucompp
+; X86-NEXT:    fnstsw %ax
+; X86-NEXT:    # kill: def $ah killed $ah killed $ax
+; X86-NEXT:    sahf
+; X86-NEXT:    setne %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: not_issubnormal_or_zero_or_nan_f_daz:
+; X64:       # %bb.0:
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    ucomiss %xmm1, %xmm0
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 780)  ; ~(0xf0|0x3) = ~"subnormal|zero|nan"
   ret i1 %class
 }
 
 define i1 @not_issubnormal_or_zero_or_snan_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_or_zero_or_snan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setge %cl
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %dl
-; CHECK-32-NEXT:    orb %cl, %dl
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %dl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_or_snan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setge %cl
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %dl
-; CHECK-64-NEXT:    orb %cl, %dl
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %dl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_or_snan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setge %cl
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %dl
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %dl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_issubnormal_or_zero_or_snan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setge %cl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %dl
+; X64-GENERIC-NEXT:    orb %cl, %dl
+; X64-GENERIC-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    orb %dl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_issubnormal_or_zero_or_snan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setge %cl
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %dl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 782)  ; ~(0x90|0x1) = ~"subnormal|snan"
   ret i1 %class
 }
 
 define i1 @not_issubnormal_or_zero_or_qnan_f(float %x) {
-; CHECK-32-LABEL: not_issubnormal_or_zero_or_qnan_f:
-; CHECK-32:       # %bb.0:
-; CHECK-32-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-32-NEXT:    andl {{[0-9]+}}(%esp), %eax
-; CHECK-32-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-32-NEXT:    setl %cl
-; CHECK-32-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-32-NEXT:    setge %dl
-; CHECK-32-NEXT:    andb %cl, %dl
-; CHECK-32-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-32-NEXT:    sete %cl
-; CHECK-32-NEXT:    orb %dl, %cl
-; CHECK-32-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-32-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-32-NEXT:    setb %al
-; CHECK-32-NEXT:    orb %cl, %al
-; CHECK-32-NEXT:    retl
-;
-; CHECK-64-LABEL: not_issubnormal_or_zero_or_qnan_f:
-; CHECK-64:       # %bb.0:
-; CHECK-64-NEXT:    movd %xmm0, %eax
-; CHECK-64-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
-; CHECK-64-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
-; CHECK-64-NEXT:    setl %cl
-; CHECK-64-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
-; CHECK-64-NEXT:    setge %dl
-; CHECK-64-NEXT:    andb %cl, %dl
-; CHECK-64-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
-; CHECK-64-NEXT:    sete %cl
-; CHECK-64-NEXT:    orb %dl, %cl
-; CHECK-64-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
-; CHECK-64-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
-; CHECK-64-NEXT:    setb %al
-; CHECK-64-NEXT:    orb %cl, %al
-; CHECK-64-NEXT:    retq
+; X86-LABEL: not_issubnormal_or_zero_or_qnan_f:
+; X86:       # %bb.0:
+; X86-NEXT:    movl $2147483647, %eax # imm = 0x7FFFFFFF
+; X86-NEXT:    andl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X86-NEXT:    setl %cl
+; X86-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT:    setge %dl
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X86-NEXT:    sete %cl
+; X86-NEXT:    orb %dl, %cl
+; X86-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X86-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X86-NEXT:    setb %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-GENERIC-LABEL: not_issubnormal_or_zero_or_qnan_f:
+; X64-GENERIC:       # %bb.0:
+; X64-GENERIC-NEXT:    movd %xmm0, %eax
+; X64-GENERIC-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-GENERIC-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-GENERIC-NEXT:    setl %cl
+; X64-GENERIC-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-GENERIC-NEXT:    setge %dl
+; X64-GENERIC-NEXT:    andb %cl, %dl
+; X64-GENERIC-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-GENERIC-NEXT:    sete %cl
+; X64-GENERIC-NEXT:    orb %dl, %cl
+; X64-GENERIC-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-GENERIC-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-GENERIC-NEXT:    setb %al
+; X64-GENERIC-NEXT:    orb %cl, %al
+; X64-GENERIC-NEXT:    retq
+;
+; X64-NDD-LABEL: not_issubnormal_or_zero_or_qnan_f:
+; X64-NDD:       # %bb.0:
+; X64-NDD-NEXT:    movd %xmm0, %eax
+; X64-NDD-NEXT:    andl $2147483647, %eax # imm = 0x7FFFFFFF
+; X64-NDD-NEXT:    cmpl $2143289344, %eax # imm = 0x7FC00000
+; X64-NDD-NEXT:    setl %cl
+; X64-NDD-NEXT:    cmpl $2139095041, %eax # imm = 0x7F800001
+; X64-NDD-NEXT:    setge %dl
+; X64-NDD-NEXT:    andb %dl, %cl
+; X64-NDD-NEXT:    cmpl $2139095040, %eax # imm = 0x7F800000
+; X64-NDD-NEXT:    sete %dl
+; X64-NDD-NEXT:    orb %dl, %cl
+; X64-NDD-NEXT:    addl $-8388608, %eax # imm = 0xFF800000
+; X64-NDD-NEXT:    cmpl $2130706432, %eax # imm = 0x7F000000
+; X64-NDD-NEXT:    setb %al
+; X64-NDD-NEXT:    orb %cl, %al
+; X64-NDD-NEXT:    retq
   %class = tail call i1 @llvm.is.fpclass.f32(float %x, i32 781)  ; ~(0x90|0x2) = ~"subnormal|qnan"
   ret i1 %class
 }
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 0adb9ddfc426a8..d71fd470651cf9 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -7771,6 +7771,234 @@ define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x float> %j, <16 x i3
   %r = select <16 x i1> %mask, <16 x float> %x, <16 x float> %i
   ret <16 x float> %r
 }
+
+define <1 x float> @constrained_vector_tan_v1f32() #0 {
+; CHECK-LABEL: constrained_vector_tan_v1f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    callq tanf@PLT
+; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; AVX-LABEL: constrained_vector_tan_v1f32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    pushq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 16
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    callq tanf@PLT
+; AVX-NEXT:    popq %rax
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %tan = call <1 x float> @llvm.experimental.constrained.tan.v1f32(
+                             <1 x float> <float 42.0>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <1 x float> %tan
+}
+
+define <2 x double> @constrained_vector_tan_v2f64() #0 {
+; CHECK-LABEL: constrained_vector_tan_v2f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; AVX-LABEL: constrained_vector_tan_v2f64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    subq $24, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 32
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    addq $24, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %tan = call <2 x double> @llvm.experimental.constrained.tan.v2f64(
+                             <2 x double> <double 42.0, double 42.1>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <2 x double> %tan
+}
+
+define <3 x float> @constrained_vector_tan_v3f32() #0 {
+; CHECK-LABEL: constrained_vector_tan_v3f32:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    callq tanf@PLT
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    callq tanf@PLT
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT:    callq tanf@PLT
+; CHECK-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
+; CHECK-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; AVX-LABEL: constrained_vector_tan_v3f32:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    callq tanf@PLT
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    callq tanf@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT:    callq tanf@PLT
+; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
+; AVX-NEXT:    vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %tan = call <3 x float> @llvm.experimental.constrained.tan.v3f32(
+                              <3 x float> <float 42.0, float 43.0, float 44.0>,
+                              metadata !"round.dynamic",
+                              metadata !"fpexcept.strict") #0
+  ret <3 x float> %tan
+}
+
+define <3 x double> @constrained_vector_tan_v3f64() #0 {
+; CHECK-LABEL: constrained_vector_tan_v3f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 32
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movsd %xmm0, (%rsp) # 8-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    fldl {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    wait
+; CHECK-NEXT:    movsd (%rsp), %xmm0 # 8-byte Reload
+; CHECK-NEXT:    # xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload
+; CHECK-NEXT:    # xmm1 = mem[0],zero
+; CHECK-NEXT:    addq $24, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; AVX-LABEL: constrained_vector_tan_v3f64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovups %ymm0, (%rsp) # 32-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vmovups (%rsp), %ymm1 # 32-byte Reload
+; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %tan = call <3 x double> @llvm.experimental.constrained.tan.v3f64(
+                          <3 x double> <double 42.0, double 42.1, double 42.2>,
+                          metadata !"round.dynamic",
+                          metadata !"fpexcept.strict") #0
+  ret <3 x double> %tan
+}
+
+define <4 x double> @constrained_vector_tan_v4f64() #0 {
+; CHECK-LABEL: constrained_vector_tan_v4f64:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 48
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm0 = xmm0[0],mem[0]
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; CHECK-NEXT:    callq tan@PLT
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    # xmm1 = xmm1[0],mem[0]
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    retq
+;
+; AVX-LABEL: constrained_vector_tan_v4f64:
+; AVX:       # %bb.0: # %entry
+; AVX-NEXT:    subq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 48
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT:    callq tan@PLT
+; AVX-NEXT:    vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT:    # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT:    vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT:    addq $40, %rsp
+; AVX-NEXT:    .cfi_def_cfa_offset 8
+; AVX-NEXT:    retq
+entry:
+  %tan = call <4 x double> @llvm.experimental.constrained.tan.v4f64(
+                             <4 x double> <double 42.0, double 42.1,
+                                           double 42.2, double 42.3>,
+                             metadata !"round.dynamic",
+                             metadata !"fpexcept.strict") #0
+  ret <4 x double> %tan
+}
+
+
+
 declare  <16 x float> @llvm.experimental.constrained.fadd.v16f32(<16 x float>, <16 x float>, metadata, metadata)
 
 attributes #0 = { strictfp }
@@ -7786,6 +8014,7 @@ declare <2 x double> @llvm.experimental.constrained.pow.v2f64(<2 x double>, <2 x
 declare <2 x double> @llvm.experimental.constrained.powi.v2f64(<2 x double>, i32, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.sin.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.cos.v2f64(<2 x double>, metadata, metadata)
+declare <2 x double> @llvm.experimental.constrained.tan.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.exp2.v2f64(<2 x double>, metadata, metadata)
 declare <2 x double> @llvm.experimental.constrained.log.v2f64(<2 x double>, metadata, metadata)
@@ -7829,6 +8058,7 @@ declare <1 x float> @llvm.experimental.constrained.pow.v1f32(<1 x float>, <1 x f
 declare <1 x float> @llvm.experimental.constrained.powi.v1f32(<1 x float>, i32, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.sin.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.cos.v1f32(<1 x float>, metadata, metadata)
+declare <1 x float> @llvm.experimental.constrained.tan.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.exp2.v1f32(<1 x float>, metadata, metadata)
 declare <1 x float> @llvm.experimental.constrained.log.v1f32(<1 x float>, metadata, metadata)
@@ -7882,6 +8112,8 @@ declare <3 x float> @llvm.experimental.constrained.sin.v3f32(<3 x float>, metada
 declare <3 x double> @llvm.experimental.constrained.sin.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.cos.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.cos.v3f64(<3 x double>, metadata, metadata)
+declare <3 x float> @llvm.experimental.constrained.tan.v3f32(<3 x float>, metadata, metadata)
+declare <3 x double> @llvm.experimental.constrained.tan.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp.v3f32(<3 x float>, metadata, metadata)
 declare <3 x double> @llvm.experimental.constrained.exp.v3f64(<3 x double>, metadata, metadata)
 declare <3 x float> @llvm.experimental.constrained.exp2.v3f32(<3 x float>, metadata, metadata)
@@ -7938,6 +8170,7 @@ declare <4 x double> @llvm.experimental.constrained.pow.v4f64(<4 x double>, <4 x
 declare <4 x double> @llvm.experimental.constrained.powi.v4f64(<4 x double>, i32, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cos.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.exp2.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.log.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 929671d674e5e3..4a47e7613dfa6d 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -543,10 +543,10 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
@@ -573,19 +573,19 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %al, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
@@ -651,10 +651,10 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
@@ -681,19 +681,19 @@ define void @load_2byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %ax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movw %cx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_2byte_chunk_of_16byte_alloca:
@@ -758,10 +758,10 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leal (%rax,%rax), %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    testb $64, %sil
@@ -788,19 +788,19 @@ define void @load_4byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-HAVE-BMI2-NO-SHLD:       # %bb.0:
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movdqu (%rdi), %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm1, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %xmm0, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rax,%rax), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (%rcx,%rcx), %r8d
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r8, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rsi, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    testb $64, %sil
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rdi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    cmoveq %rax, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-HAVE-BMI2-HAVE-SHLD-LABEL: load_4byte_chunk_of_16byte_alloca:
diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll
index b92408a1bf1cd5..78275a16d3e8f7 100644
--- a/llvm/test/Feature/fp-intrinsics.ll
+++ b/llvm/test/Feature/fp-intrinsics.ll
@@ -151,6 +151,17 @@ entry:
   ret double %result
 }
 
+; Verify that tan(42.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: ftan
+; CHECK: call double @llvm.experimental.constrained.tan
+define double @ftan() #0 {
+entry:
+  %result = call double @llvm.experimental.constrained.tan.f64(double 42.0,
+                                               metadata !"round.dynamic",
+                                               metadata !"fpexcept.strict") #0
+  ret double %result
+}
+
 ; Verify that exp(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: f10
 ; CHECK: call double @llvm.experimental.constrained.exp
@@ -407,6 +418,7 @@ declare double @llvm.experimental.constrained.pow.f64(double, double, metadata,
 declare double @llvm.experimental.constrained.powi.f64(double, i32, metadata, metadata)
 declare double @llvm.experimental.constrained.sin.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.cos.f64(double, metadata, metadata)
+declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.exp.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.exp2.f64(double, metadata, metadata)
 declare double @llvm.experimental.constrained.log.f64(double, metadata, metadata)
diff --git a/llvm/test/MC/RISCV/rvv/vsetvl.s b/llvm/test/MC/RISCV/rvv/vsetvl.s
index c9197d8917a472..2741def0eeff21 100644
--- a/llvm/test/MC/RISCV/rvv/vsetvl.s
+++ b/llvm/test/MC/RISCV/rvv/vsetvl.s
@@ -1,5 +1,7 @@
 # RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+v %s \
 # RUN:        | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST
+# RUN: llvm-mc -triple=riscv64 -show-encoding --mattr=+zve32x %s 2>&1 \
+# RUN:        | FileCheck %s --check-prefix=CHECK-ZVE32X
 # RUN: not llvm-mc -triple=riscv64 -show-encoding %s 2>&1 \
 # RUN:        | FileCheck %s --check-prefix=CHECK-ERROR
 # RUN: llvm-mc -triple=riscv64 -filetype=obj --mattr=+v %s \
@@ -71,18 +73,21 @@ vsetvli a2, a0, e32, m8, ta, ma
 
 vsetvli a2, a0, e32, mf2, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf2, ta, ma
+# CHECK-ZVE32X: :[[#@LINE-2]]:17: warning: use of vtype encodings with SEW > 16 and LMUL == mf2 may not be compatible with all RVV implementations{{$}}
 # CHECK-ENCODING: [0x57,0x76,0x75,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
 # CHECK-UNKNOWN: 0d757657 <unknown>
 
 vsetvli a2, a0, e32, mf4, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf4, ta, ma
+# CHECK-ZVE32X: :[[#@LINE-2]]:17: warning: use of vtype encodings with SEW > 8 and LMUL == mf4 may not be compatible with all RVV implementations{{$}}
 # CHECK-ENCODING: [0x57,0x76,0x65,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
 # CHECK-UNKNOWN: 0d657657 <unknown>
 
 vsetvli a2, a0, e32, mf8, ta, ma
 # CHECK-INST: vsetvli a2, a0, e32, mf8, ta, ma
+# CHECK-ZVE32X: :[[#@LINE-2]]:22: warning: use of vtype encodings with LMUL < SEWMIN/ELEN == mf4 is reserved{{$}}
 # CHECK-ENCODING: [0x57,0x76,0x55,0x0d]
 # CHECK-ERROR: instruction requires the following: 'V' (Vector Extension for Application Processors), 'Zve32x' (Vector Extensions for Embedded Processors){{$}}
 # CHECK-UNKNOWN: 0d557657 <unknown>
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 17318b2c62ca8c..c9366f4434b1c1 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -1750,7 +1750,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
+; CI-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]]
 ; CI-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; CI-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
@@ -1766,7 +1766,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_unsafe_structfp(ptr addrspace
 ; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX9:       atomicrmw.start:
 ; GFX9-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
+; GFX9-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]]
 ; GFX9-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; GFX9-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 4
@@ -1803,7 +1803,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
+; CI-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]]
 ; CI-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; CI-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -1819,7 +1819,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX9-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX9:       atomicrmw.start:
 ; GFX9-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX9-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
+; GFX9-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]]
 ; GFX9-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX9-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX9-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -1835,7 +1835,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX908-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX908:       atomicrmw.start:
 ; GFX908-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX908-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
+; GFX908-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]]
 ; GFX908-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX908-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX908-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -1859,7 +1859,7 @@ define double @test_atomicrmw_fadd_f64_global_unsafe_strictfp(ptr addrspace(1) %
 ; GFX11-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; GFX11:       atomicrmw.start:
 ; GFX11-NEXT:    [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX11-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9:[0-9]+]]
+; GFX11-NEXT:    [[NEW:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[LOADED]], double [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10:[0-9]+]]
 ; GFX11-NEXT:    [[TMP2:%.*]] = bitcast double [[NEW]] to i64
 ; GFX11-NEXT:    [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
 ; GFX11-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("wavefront") monotonic monotonic, align 8
@@ -1880,7 +1880,7 @@ define float @test_atomicrmw_fadd_f32_local_strictfp(ptr addrspace(3) %ptr, floa
 ; CI-NEXT:    br label [[ATOMICRMW_START:%.*]]
 ; CI:       atomicrmw.start:
 ; CI-NEXT:    [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; CI-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
+; CI-NEXT:    [[NEW:%.*]] = call float @llvm.experimental.constrained.fadd.f32(float [[LOADED]], float [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]]
 ; CI-NEXT:    [[TMP2:%.*]] = bitcast float [[NEW]] to i32
 ; CI-NEXT:    [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
 ; CI-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
@@ -2102,7 +2102,7 @@ define bfloat @test_atomicrmw_fadd_bf16_global_system_align4(ptr addrspace(1) %p
 
 define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bfloat %value) #2 {
 ; ALL-LABEL: @test_atomicrmw_fadd_bf16_local_strictfp(
-; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR9:[0-9]+]]
+; ALL-NEXT:    [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4) #[[ATTR10:[0-9]+]]
 ; ALL-NEXT:    [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
 ; ALL-NEXT:    [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
 ; ALL-NEXT:    [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
@@ -2115,7 +2115,7 @@ define bfloat @test_atomicrmw_fadd_bf16_local_strictfp(ptr addrspace(3) %ptr, bf
 ; ALL-NEXT:    [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
 ; ALL-NEXT:    [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
 ; ALL-NEXT:    [[TMP4:%.*]] = bitcast i16 [[EXTRACTED]] to bfloat
-; ALL-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR9]]
+; ALL-NEXT:    [[NEW:%.*]] = call bfloat @llvm.experimental.constrained.fadd.bf16(bfloat [[TMP4]], bfloat [[VALUE:%.*]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR10]]
 ; ALL-NEXT:    [[TMP5:%.*]] = bitcast bfloat [[NEW]] to i16
 ; ALL-NEXT:    [[EXTENDED:%.*]] = zext i16 [[TMP5]] to i32
 ; ALL-NEXT:    [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
@@ -4669,11 +4669,180 @@ define void @test_atomicrmw_fadd_v2bf16_flat_local_noret(ptr addrspace(3) %ptr,
   ret void
 }
 
+define <2 x half> @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(ptr %ptr, <2 x half> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define void @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(ptr %ptr, <2 x half> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2f16_flat_agent_noret__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x half> @test_atomicrmw_fadd_v2f16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret <2 x half> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret <2 x half> %res
+}
+
+define void @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x half> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2f16_global_agent_noret__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x half>, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x half> %value syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(ptr %ptr, <2 x bfloat> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_flat_agent_noret__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret void
+}
+
+define <2 x bfloat> @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret <2 x bfloat> [[TMP5]]
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret <2 x bfloat> %res
+}
+
+define void @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(ptr addrspace(1) %ptr, <2 x bfloat> %value) #6 {
+; ALL-LABEL: @test_atomicrmw_fadd_v2bf16_global_agent_noret__unsafe(
+; ALL-NEXT:    [[TMP1:%.*]] = load <2 x bfloat>, ptr addrspace(1) [[PTR:%.*]], align 4
+; ALL-NEXT:    br label [[ATOMICRMW_START:%.*]]
+; ALL:       atomicrmw.start:
+; ALL-NEXT:    [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
+; ALL-NEXT:    [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE:%.*]]
+; ALL-NEXT:    [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32
+; ALL-NEXT:    [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32
+; ALL-NEXT:    [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4
+; ALL-NEXT:    [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; ALL-NEXT:    [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+; ALL-NEXT:    [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat>
+; ALL-NEXT:    br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; ALL:       atomicrmw.end:
+; ALL-NEXT:    ret void
+;
+  %res = atomicrmw fadd ptr addrspace(1) %ptr, <2 x bfloat> %value syncscope("agent") seq_cst
+  ret void
+}
+
 attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
 attributes #2 = { strictfp }
 attributes #3 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 attributes #4 = { "denormal-fp-math-f32"="dynamic,dynamic" }
 attributes #5 = { "denormal-fp-math"="dynamic,dynamic" }
+attributes #6 = { "amdgpu-unsafe-fp-atomics"="true" }
 
 !0 = !{}
diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll
index 871913610429b3..f968fa6f9afd0c 100644
--- a/llvm/test/Transforms/InstCombine/getelementptr.ll
+++ b/llvm/test/Transforms/InstCombine/getelementptr.ll
@@ -419,20 +419,48 @@ define ptr @test_index_canon_nusw_nuw(ptr %X, i32 %Idx) {
   ret ptr %R
 }
 
-define ptr @test_index_canon_const_expr_inbounds(ptr %X, i32 %Idx) {
+define ptr @test_index_canon_const_expr_inbounds() {
 ; CHECK-LABEL: @test_index_canon_const_expr_inbounds(
 ; CHECK-NEXT:    ret ptr getelementptr inbounds (i8, ptr @Global, i64 123)
 ;
   ret ptr getelementptr inbounds (i8, ptr @Global, i32 123)
 }
 
-define ptr @test_index_canon_const_expr_nuw_nusw(ptr %X, i32 %Idx) {
+define ptr @test_index_canon_const_expr_nuw_nusw() {
 ; CHECK-LABEL: @test_index_canon_const_expr_nuw_nusw(
-; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @Global, i64 123)
+; CHECK-NEXT:    ret ptr getelementptr nusw nuw (i8, ptr @Global, i64 123)
 ;
   ret ptr getelementptr nusw nuw (i8, ptr @Global, i32 123)
 }
 
+define ptr @test_const_gep_gep_nuw() {
+; CHECK-LABEL: @test_const_gep_gep_nuw(
+; CHECK-NEXT:    ret ptr getelementptr nuw (i8, ptr @Global, i64 246)
+;
+  ret ptr getelementptr nuw (i8, ptr getelementptr nuw (i8, ptr @Global, i64 123), i64 123)
+}
+
+define ptr @test_const_gep_gep_nusw_no_overflow() {
+; CHECK-LABEL: @test_const_gep_gep_nusw_no_overflow(
+; CHECK-NEXT:    ret ptr getelementptr nusw (i8, ptr @Global, i64 246)
+;
+  ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 123), i64 123)
+}
+
+define ptr @test_const_gep_gep_nusw_no_overflow_neg() {
+; CHECK-LABEL: @test_const_gep_gep_nusw_no_overflow_neg(
+; CHECK-NEXT:    ret ptr getelementptr nusw (i8, ptr @Global, i64 -246)
+;
+  ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 -123), i64 -123)
+}
+
+define ptr @test_const_gep_gep_nusw_overflow() {
+; CHECK-LABEL: @test_const_gep_gep_nusw_overflow(
+; CHECK-NEXT:    ret ptr getelementptr (i8, ptr @Global, i64 -2)
+;
+  ret ptr getelementptr nusw (i8, ptr getelementptr nusw (i8, ptr @Global, i64 u0x7fffffffffffffff), i64 u0x7fffffffffffffff)
+}
+
 define i1 @test17(ptr %P, i32 %I, i32 %J) {
 ; CHECK-LABEL: @test17(
 ; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 [[I:%.*]], [[J:%.*]]
diff --git a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll
index 304874645d5dc9..7ff111c42a9e06 100644
--- a/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll
+++ b/llvm/test/Transforms/InstCombine/icmp-of-or-x.ll
@@ -399,3 +399,13 @@ define i1 @icmp_eq_x_invertable_y2(i8 %x, i8 %y) {
   %r = icmp eq i8 %yy, %or
   ret i1 %r
 }
+
+define i1 @PR38139(i8 %arg) {
+; CHECK-LABEL: @PR38139(
+; CHECK-NEXT:    [[R:%.*]] = icmp ult i8 [[ARG:%.*]], -64
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %masked = or i8 %arg, 192
+  %r = icmp ne i8 %masked, %arg
+  ret i1 %r
+}
diff --git a/llvm/test/Transforms/InstCombine/ldexp-zext.ll b/llvm/test/Transforms/InstCombine/ldexp-ext.ll
similarity index 51%
rename from llvm/test/Transforms/InstCombine/ldexp-zext.ll
rename to llvm/test/Transforms/InstCombine/ldexp-ext.ll
index b6e4f124940595..4608553eb88743 100644
--- a/llvm/test/Transforms/InstCombine/ldexp-zext.ll
+++ b/llvm/test/Transforms/InstCombine/ldexp-ext.ll
@@ -55,3 +55,58 @@ define <2 x float> @ldexp_zext_float_vector(<2 x float> %x, <2 x i1> %bool) {
   %ldexp = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %zext)
   ret <2 x float> %ldexp
 }
+
+define float @ldexp_sext_float(float %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_sext_float(
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[BOOL:%.*]], float 5.000000e-01, float 1.000000e+00
+; CHECK-NEXT:    [[LDEXP:%.*]] = fmul float [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret float [[LDEXP]]
+;
+  %sext = sext i1 %bool to i32
+  %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %sext)
+  ret float %ldexp
+}
+
+define float @ldexp_sext_float_negative(float %x, i8 %y) {
+; CHECK-LABEL: @ldexp_sext_float_negative(
+; CHECK-NEXT:    [[SEXT:%.*]] = sext i8 [[Y:%.*]] to i32
+; CHECK-NEXT:    [[LDEXP:%.*]] = call float @llvm.ldexp.f32.i32(float [[X:%.*]], i32 [[SEXT]])
+; CHECK-NEXT:    ret float [[LDEXP]]
+;
+  %sext = sext i8 %y to i32
+  %ldexp = call float @llvm.ldexp.f32.i32(float %x, i32 %sext)
+  ret float %ldexp
+}
+
+define double @ldexp_sext_double(double %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_sext_double(
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00
+; CHECK-NEXT:    [[LDEXP:%.*]] = fmul double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret double [[LDEXP]]
+;
+  %sext = sext i1 %bool to i32
+  %ldexp = call double @llvm.ldexp.f64.i32(double %x, i32 %sext)
+  ret double %ldexp
+}
+
+define double @ldexp_sext_double_fast_math(double %x, i1 %bool) {
+; CHECK-LABEL: @ldexp_sext_double_fast_math(
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 [[BOOL:%.*]], double 5.000000e-01, double 1.000000e+00
+; CHECK-NEXT:    [[LDEXP:%.*]] = fmul reassoc double [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret double [[LDEXP]]
+;
+  %sext = sext i1 %bool to i32
+  %ldexp = call reassoc double @llvm.ldexp.f64.i32(double %x, i32 %sext)
+  ret double %ldexp
+}
+
+define <2 x float> @ldexp_sext_float_vector(<2 x float> %x, <2 x i1> %bool) {
+; CHECK-LABEL: @ldexp_sext_float_vector(
+; CHECK-NEXT:    [[TMP1:%.*]] = select <2 x i1> [[BOOL:%.*]], <2 x float> <float 5.000000e-01, float 5.000000e-01>, <2 x float> <float 1.000000e+00, float 1.000000e+00>
+; CHECK-NEXT:    [[LDEXP:%.*]] = fmul <2 x float> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x float> [[LDEXP]]
+;
+  %sext = sext <2 x i1> %bool to <2 x i32>
+  %ldexp = call <2 x float> @llvm.ldexp.v2f32.v2i32(<2 x float> %x, <2 x i32> %sext)
+  ret <2 x float> %ldexp
+}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
index 8e06cd57a10014..0abca5b383224a 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/basic.ll
@@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) {
 
 declare i1 @llvm.expect.i1(i1, i1) nounwind readnone
 
-; CHECK: !0 = !{!"branch_weights", !"expected", i32 2000, i32 1}
-; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2000}
-; CHECK: !2 = !{!"branch_weights", !"expected", i32 1, i32 1, i32 2000}
-; CHECK: !3 = !{!"branch_weights", !"expected", i32 2000, i32 1, i32 1}
+; CHECK: !0 = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: !1 = !{!"branch_weights", i32 1, i32 2000}
+; CHECK: !2 = !{!"branch_weights", i32 1, i32 1, i32 2000}
+; CHECK: !3 = !{!"branch_weights", i32 2000, i32 1, i32 1}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
index 40571278ca93f3..64293557050c12 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect-with-probability.ll
@@ -284,7 +284,7 @@ define i32 @test10(i64 %t6) {
 
 declare i1 @llvm.expect.with.probability.i1(i1, i1, double) nounwind readnone
 
-; CHECK: !0 = !{!"branch_weights", !"expected", i32 1717986918, i32 429496731}
-; CHECK: !1 = !{!"branch_weights", !"expected", i32 429496731, i32 1717986918}
-; CHECK: !2 = !{!"branch_weights", !"expected", i32 214748366, i32 214748366, i32 1717986918}
-; CHECK: !3 = !{!"branch_weights", !"expected", i32 1717986918, i32 214748366, i32 214748366}
+; CHECK: !0 = !{!"branch_weights", i32 1717986918, i32 429496731}
+; CHECK: !1 = !{!"branch_weights", i32 429496731, i32 1717986918}
+; CHECK: !2 = !{!"branch_weights", i32 214748366, i32 214748366, i32 1717986918}
+; CHECK: !3 = !{!"branch_weights", i32 1717986918, i32 214748366, i32 214748366}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
index 458a7758fa9707..2bcfb1e064be96 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/expect_nonboolean.ll
@@ -99,5 +99,6 @@ attributes #1 = { nounwind readnone }
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{!"clang version 5.0.0 (trunk 304373)"}
-; CHECK: [[LIKELY]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
-; CHECK: [[UNLIKELY]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
+; CHECK: [[LIKELY]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[UNLIKELY]] = !{!"branch_weights", i32 1, i32 2000}
+
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
index 9b9d9a746dd324..32ae9b0b2f15cb 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_merge.ll
@@ -352,5 +352,5 @@ declare i64 @llvm.expect.i64(i64, i64)
 !llvm.ident = !{!0}
 
 !0 = !{!"clang version 5.0.0 (trunk 302965)"}
-; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
-; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
index e9a843225993ad..1efa63241c2c06 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_or.ll
@@ -99,5 +99,5 @@ declare i64 @llvm.expect.i64(i64, i64)
 
 
 !0 = !{!"clang version 5.0.0 (trunk 302965)"}
-; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
-; CHECK: [[WEIGHT2]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 2000, i32 1}
+; CHECK: [[WEIGHT2]] = !{!"branch_weights", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
index 13db2c394bab29..9cbaca8d13dc0a 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_tern.ll
@@ -53,4 +53,4 @@ declare i64 @llvm.expect.i64(i64, i64)
 
 !0 = !{!"clang version 5.0.0 (trunk 302965)"}
 
-; CHECK: [[WEIGHT]] = !{!"branch_weights", !"expected", i32 1, i32 2000}
+; CHECK: [[WEIGHT]] = !{!"branch_weights", i32 1, i32 2000}
diff --git a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
index 275731d6188955..2bad66343b761b 100644
--- a/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
+++ b/llvm/test/Transforms/LowerExpectIntrinsic/phi_unexpect.ll
@@ -235,5 +235,5 @@ block5:
   ret void
 }
 
-; CHECK: !0 = !{!"branch_weights", !"expected", i32 2147483647, i32 1}
-; CHECK: !1 = !{!"branch_weights", !"expected", i32 1, i32 2147483647}
+; CHECK: !0 = !{!"branch_weights", i32 2147483647, i32 1}
+; CHECK: !1 = !{!"branch_weights", i32 1, i32 2147483647}
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
index a090fe20e1d937..55dd28b70170b8 100644
--- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoist-runtime-checks.ll
@@ -293,7 +293,7 @@ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
 ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
 ; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
 ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]}
-; CHECK: [[PROF4]] = !{!"branch_weights", !"expected", i32 2000, i32 1}
+; CHECK: [[PROF4]] = !{!"branch_weights", i32 2000, i32 1}
 ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META2]], [[META1]]}
 ;.
diff --git a/llvm/tools/dsymutil/MachOUtils.cpp b/llvm/tools/dsymutil/MachOUtils.cpp
index 8e144d640ed01f..b52ab1ce6d2949 100644
--- a/llvm/tools/dsymutil/MachOUtils.cpp
+++ b/llvm/tools/dsymutil/MachOUtils.cpp
@@ -630,7 +630,7 @@ bool generateDsymCompanion(
 
   // Emit the Dwarf sections contents.
   for (const MCSection &Sec : MCAsm) {
-    if (Sec.begin() == Sec.end())
+    if (Sec.empty())
       continue;
 
     uint64_t Pos = OutFile.tell();
diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
index 265ebcbff5877e..0b175a3852e425 100644
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -307,7 +307,8 @@ namespace options {
     } else if (opt.consume_front("opt-remarks-hotness-threshold=")) {
       auto ResultOrErr = remarks::parseHotnessThresholdOption(opt);
       if (!ResultOrErr)
-        message(LDPL_FATAL, "Invalid remarks hotness threshold: %s", opt);
+        message(LDPL_FATAL, "Invalid remarks hotness threshold: %s",
+                opt.data());
       else
         RemarksHotnessThreshold = *ResultOrErr;
     } else if (opt.consume_front("opt-remarks-format=")) {
@@ -319,7 +320,7 @@ namespace options {
     } else if (opt.consume_front("time-trace-granularity=")) {
       unsigned Granularity;
       if (opt.getAsInteger(10, Granularity))
-        message(LDPL_FATAL, "Invalid time trace granularity: %s", opt);
+        message(LDPL_FATAL, "Invalid time trace granularity: %s", opt.data());
       else
         time_trace_granularity = Granularity;
     } else {
diff --git a/llvm/tools/llvm-as/llvm-as.cpp b/llvm/tools/llvm-as/llvm-as.cpp
index 0958e16c2197ac..e48e3f4d22c123 100644
--- a/llvm/tools/llvm-as/llvm-as.cpp
+++ b/llvm/tools/llvm-as/llvm-as.cpp
@@ -142,10 +142,11 @@ int main(int argc, char **argv) {
   }
 
   // Convert to new debug format if requested.
-  M->setIsNewDbgInfoFormat(UseNewDbgInfoFormat &&
-                           WriteNewDbgInfoFormatToBitcode);
-  if (M->IsNewDbgInfoFormat)
+  assert(!M->IsNewDbgInfoFormat && "Unexpectedly in new debug mode");
+  if (UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode) {
+    M->convertToNewDbgValues();
     M->removeDebugIntrinsicDeclarations();
+  }
 
   std::unique_ptr<ModuleSummaryIndex> Index = std::move(ModuleAndIndex.Index);
 
diff --git a/llvm/tools/llvm-dis/llvm-dis.cpp b/llvm/tools/llvm-dis/llvm-dis.cpp
index d28af85bc739eb..fbbb5506e43e05 100644
--- a/llvm/tools/llvm-dis/llvm-dis.cpp
+++ b/llvm/tools/llvm-dis/llvm-dis.cpp
@@ -258,7 +258,7 @@ int main(int argc, char **argv) {
       // All that llvm-dis does is write the assembly to a file.
       if (!DontPrint) {
         if (M) {
-          M->setIsNewDbgInfoFormat(WriteNewDbgInfoFormat);
+          ScopedDbgInfoFormatSetter FormatSetter(*M, WriteNewDbgInfoFormat);
           if (WriteNewDbgInfoFormat)
             M->removeDebugIntrinsicDeclarations();
           M->print(Out->os(), Annotator.get(), PreserveAssemblyUseListOrder);
diff --git a/llvm/tools/llvm-link/llvm-link.cpp b/llvm/tools/llvm-link/llvm-link.cpp
index b84469d1c757f8..7794f2d81ed064 100644
--- a/llvm/tools/llvm-link/llvm-link.cpp
+++ b/llvm/tools/llvm-link/llvm-link.cpp
@@ -489,6 +489,12 @@ int main(int argc, char **argv) {
   if (LoadBitcodeIntoNewDbgInfoFormat == cl::boolOrDefault::BOU_UNSET)
     LoadBitcodeIntoNewDbgInfoFormat = cl::boolOrDefault::BOU_TRUE;
 
+  // RemoveDIs debug-info transition: tests may request that we /try/ to use the
+  // new debug-info format.
+  if (TryUseNewDbgInfoFormat) {
+    // Turn the new debug-info format on.
+    UseNewDbgInfoFormat = true;
+  }
   // Since llvm-link collects multiple IR modules together, for simplicity's
   // sake we disable the "PreserveInputDbgFormat" flag to enforce a single
   // debug info format.
@@ -550,7 +556,7 @@ int main(int argc, char **argv) {
     SetFormat(WriteNewDbgInfoFormat);
     Composite->print(Out.os(), nullptr, PreserveAssemblyUseListOrder);
   } else if (Force || !CheckBitcodeOutputToConsole(Out.os())) {
-    SetFormat(UseNewDbgInfoFormat && WriteNewDbgInfoFormatToBitcode);
+    SetFormat(WriteNewDbgInfoFormatToBitcode);
     WriteBitcodeToFile(*Composite, Out.os(), PreserveBitcodeUseListOrder);
   }
 
diff --git a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp
index b97d75a822f753..5f0697f5aaad75 100644
--- a/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp
+++ b/llvm/tools/llvm-reduce/deltas/ReduceInstructionsMIR.cpp
@@ -65,7 +65,7 @@ static bool shouldNotRemoveInstruction(const TargetInstrInfo &TII,
 
 static void extractInstrFromFunction(Oracle &O, MachineFunction &MF) {
   MachineDominatorTree MDT;
-  MDT.runOnMachineFunction(MF);
+  MDT.calculate(MF);
 
   auto MRI = &MF.getRegInfo();
   SetVector<MachineInstr *> ToDelete;
diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp
index 6e4dda8351a1b1..7007d944801a75 100644
--- a/llvm/unittests/ADT/APFloatTest.cpp
+++ b/llvm/unittests/ADT/APFloatTest.cpp
@@ -723,11 +723,13 @@ TEST(APFloatTest, IsSmallestNormalized) {
     EXPECT_FALSE(APFloat::getZero(Semantics, false).isSmallestNormalized());
     EXPECT_FALSE(APFloat::getZero(Semantics, true).isSmallestNormalized());
 
-    EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized());
-    EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized());
+    if (APFloat::hasNanOrInf(Semantics)) {
+      EXPECT_FALSE(APFloat::getInf(Semantics, false).isSmallestNormalized());
+      EXPECT_FALSE(APFloat::getInf(Semantics, true).isSmallestNormalized());
 
-    EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized());
-    EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized());
+      EXPECT_FALSE(APFloat::getQNaN(Semantics).isSmallestNormalized());
+      EXPECT_FALSE(APFloat::getSNaN(Semantics).isSmallestNormalized());
+    }
 
     EXPECT_FALSE(APFloat::getLargest(Semantics).isSmallestNormalized());
     EXPECT_FALSE(APFloat::getLargest(Semantics, true).isSmallestNormalized());
@@ -1823,6 +1825,9 @@ TEST(APFloatTest, getLargest) {
       30, APFloat::getLargest(APFloat::Float8E4M3B11FNUZ()).convertToDouble());
   EXPECT_EQ(3.40116213421e+38f,
             APFloat::getLargest(APFloat::FloatTF32()).convertToFloat());
+  EXPECT_EQ(28, APFloat::getLargest(APFloat::Float6E3M2FN()).convertToDouble());
+  EXPECT_EQ(7.5,
+            APFloat::getLargest(APFloat::Float6E2M3FN()).convertToDouble());
 }
 
 TEST(APFloatTest, getSmallest) {
@@ -1881,6 +1886,20 @@ TEST(APFloatTest, getSmallest) {
   EXPECT_TRUE(test.isFiniteNonZero());
   EXPECT_TRUE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+  expected = APFloat(APFloat::Float6E3M2FN(), "0x0.1p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  test = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+  expected = APFloat(APFloat::Float6E2M3FN(), "0x0.2p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
 }
 
 TEST(APFloatTest, getSmallestNormalized) {
@@ -1963,6 +1982,21 @@ TEST(APFloatTest, getSmallestNormalized) {
   EXPECT_FALSE(test.isDenormal());
   EXPECT_TRUE(test.bitwiseIsEqual(expected));
   EXPECT_TRUE(test.isSmallestNormalized());
+  test = APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+  expected = APFloat(APFloat::Float6E3M2FN(), "0x1p-2");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_FALSE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+  EXPECT_TRUE(test.isSmallestNormalized());
+
+  test = APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+  expected = APFloat(APFloat::Float6E2M3FN(), "0x1p0");
+  EXPECT_FALSE(test.isNegative());
+  EXPECT_TRUE(test.isFiniteNonZero());
+  EXPECT_FALSE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+  EXPECT_TRUE(test.isSmallestNormalized());
 }
 
 TEST(APFloatTest, getZero) {
@@ -1996,7 +2030,11 @@ TEST(APFloatTest, getZero) {
       {&APFloat::Float8E4M3B11FNUZ(), false, false, {0, 0}, 1},
       {&APFloat::Float8E4M3B11FNUZ(), true, false, {0, 0}, 1},
       {&APFloat::FloatTF32(), false, true, {0, 0}, 1},
-      {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1}};
+      {&APFloat::FloatTF32(), true, true, {0x40000ULL, 0}, 1},
+      {&APFloat::Float6E3M2FN(), false, true, {0, 0}, 1},
+      {&APFloat::Float6E3M2FN(), true, true, {0x20ULL, 0}, 1},
+      {&APFloat::Float6E2M3FN(), false, true, {0, 0}, 1},
+      {&APFloat::Float6E2M3FN(), true, true, {0x20ULL, 0}, 1}};
   const unsigned NumGetZeroTests = std::size(GetZeroTest);
   for (unsigned i = 0; i < NumGetZeroTests; ++i) {
     APFloat test = APFloat::getZero(*GetZeroTest[i].semantics,
@@ -5161,6 +5199,90 @@ TEST(APFloatTest, Float8ExhaustivePair) {
   }
 }
 
+TEST(APFloatTest, Float6ExhaustivePair) {
+  // Test each pair of 6-bit floats with non-standard semantics
+  for (APFloat::Semantics Sem :
+       {APFloat::S_Float6E3M2FN, APFloat::S_Float6E2M3FN}) {
+    const llvm::fltSemantics &S = APFloat::EnumToSemantics(Sem);
+    for (int i = 1; i < 64; i++) {
+      for (int j = 1; j < 64; j++) {
+        SCOPED_TRACE("sem=" + std::to_string(Sem) + ",i=" + std::to_string(i) +
+                     ",j=" + std::to_string(j));
+        APFloat x(S, APInt(6, i));
+        APFloat y(S, APInt(6, j));
+
+        bool losesInfo;
+        APFloat x16 = x;
+        x16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+                    &losesInfo);
+        EXPECT_FALSE(losesInfo);
+        APFloat y16 = y;
+        y16.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
+                    &losesInfo);
+        EXPECT_FALSE(losesInfo);
+
+        // Add
+        APFloat z = x;
+        z.add(y, APFloat::rmNearestTiesToEven);
+        APFloat z16 = x16;
+        z16.add(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Subtract
+        z = x;
+        z.subtract(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.subtract(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Multiply
+        z = x;
+        z.multiply(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.multiply(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Skip divide by 0
+        if (j == 0 || j == 32)
+          continue;
+
+        // Divide
+        z = x;
+        z.divide(y, APFloat::rmNearestTiesToEven);
+        z16 = x16;
+        z16.divide(y16, APFloat::rmNearestTiesToEven);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Mod
+        z = x;
+        z.mod(y);
+        z16 = x16;
+        z16.mod(y16);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+
+        // Remainder
+        z = x;
+        z.remainder(y);
+        z16 = x16;
+        z16.remainder(y16);
+        z16.convert(S, APFloat::rmNearestTiesToEven, &losesInfo);
+        EXPECT_TRUE(z.bitwiseIsEqual(z16))
+            << "sem=" << Sem << ", i=" << i << ", j=" << j;
+      }
+    }
+  }
+}
+
 TEST(APFloatTest, ConvertE4M3FNToE5M2) {
   bool losesInfo;
   APFloat test(APFloat::Float8E4M3FN(), "1.0");
@@ -6620,28 +6742,39 @@ TEST(APFloatTest, getExactLog2) {
     EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2());
     EXPECT_EQ(INT_MIN, APFloat(Semantics, "3.0").getExactLog2Abs());
     EXPECT_EQ(INT_MIN, APFloat(Semantics, "-3.0").getExactLog2Abs());
-    EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2());
-    EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2());
-    EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2());
-    EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs());
-    EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2());
-    EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs());
-    EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs());
-    EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs());
+
+    if (I == APFloat::S_Float6E2M3FN) {
+      EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2());
+      EXPECT_EQ(INT_MIN, APFloat(Semantics, "-4.0").getExactLog2());
+      EXPECT_EQ(2, APFloat(Semantics, "4.0").getExactLog2Abs());
+      EXPECT_EQ(2, APFloat(Semantics, "-4.0").getExactLog2Abs());
+    } else {
+      EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2());
+      EXPECT_EQ(INT_MIN, APFloat(Semantics, "-8.0").getExactLog2());
+      EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2());
+      EXPECT_EQ(-2, APFloat(Semantics, "0.25").getExactLog2Abs());
+      EXPECT_EQ(INT_MIN, APFloat(Semantics, "-0.25").getExactLog2());
+      EXPECT_EQ(-2, APFloat(Semantics, "-0.25").getExactLog2Abs());
+      EXPECT_EQ(3, APFloat(Semantics, "8.0").getExactLog2Abs());
+      EXPECT_EQ(3, APFloat(Semantics, "-8.0").getExactLog2Abs());
+    }
 
     EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2());
     EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2());
-    EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2());
-    EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2());
-    EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2());
-    EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2());
-
     EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, false).getExactLog2Abs());
     EXPECT_EQ(INT_MIN, APFloat::getZero(Semantics, true).getExactLog2Abs());
-    EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs());
-    EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2Abs());
-    EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs());
-    EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs());
+
+    if (APFloat::hasNanOrInf(Semantics)) {
+      EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2());
+      EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2());
+      EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2());
+      EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2());
+
+      EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics).getExactLog2Abs());
+      EXPECT_EQ(INT_MIN, APFloat::getInf(Semantics, true).getExactLog2Abs());
+      EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, false).getExactLog2Abs());
+      EXPECT_EQ(INT_MIN, APFloat::getNaN(Semantics, true).getExactLog2Abs());
+    }
 
     EXPECT_EQ(INT_MIN,
               scalbn(One, MinExp - Precision - 1, APFloat::rmNearestTiesToEven)
@@ -6660,4 +6793,311 @@ TEST(APFloatTest, getExactLog2) {
   }
 }
 
+TEST(APFloatTest, Float6E3M2FNFromString) {
+  // Exactly representable
+  EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "28").convertToDouble());
+  // Round down to maximum value
+  EXPECT_EQ(28, APFloat(APFloat::Float6E3M2FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+  EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "inf"),
+               "This floating point format does not support Inf");
+  EXPECT_DEATH(APFloat(APFloat::Float6E3M2FN(), "nan"),
+               "This floating point format does not support NaN");
+#endif
+#endif
+
+  EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "0").isPosZero());
+  EXPECT_TRUE(APFloat(APFloat::Float6E3M2FN(), "-0").isNegZero());
+}
+
+TEST(APFloatTest, Float6E2M3FNFromString) {
+  // Exactly representable
+  EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "7.5").convertToDouble());
+  // Round down to maximum value
+  EXPECT_EQ(7.5, APFloat(APFloat::Float6E2M3FN(), "32").convertToDouble());
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+  EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "inf"),
+               "This floating point format does not support Inf");
+  EXPECT_DEATH(APFloat(APFloat::Float6E2M3FN(), "nan"),
+               "This floating point format does not support NaN");
+#endif
+#endif
+
+  EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "0").isPosZero());
+  EXPECT_TRUE(APFloat(APFloat::Float6E2M3FN(), "-0").isNegZero());
+}
+
+TEST(APFloatTest, ConvertE3M2FToE2M3F) {
+  bool losesInfo;
+  APFloat test(APFloat::Float6E3M2FN(), "1.0");
+  APFloat::opStatus status = test.convert(
+      APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(1.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  test = APFloat(APFloat::Float6E3M2FN(), "0.0");
+  status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  // Test overflow
+  losesInfo = false;
+  test = APFloat(APFloat::Float6E3M2FN(), "28");
+  status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(7.5f, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opInexact);
+
+  // Test underflow
+  test = APFloat(APFloat::Float6E3M2FN(), ".0625");
+  status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0., test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+
+  // Testing inexact rounding to denormal number
+  losesInfo = false;
+  test = APFloat(APFloat::Float6E3M2FN(), "0.1875");
+  status = test.convert(APFloat::Float6E2M3FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0.25, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opUnderflow | APFloat::opInexact);
+}
+
+TEST(APFloatTest, ConvertE2M3FToE3M2F) {
+  bool losesInfo;
+  APFloat test(APFloat::Float6E2M3FN(), "1.0");
+  APFloat::opStatus status = test.convert(
+      APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven, &losesInfo);
+  EXPECT_EQ(1.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  test = APFloat(APFloat::Float6E2M3FN(), "0.0");
+  status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(0.0f, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  test = APFloat(APFloat::Float6E2M3FN(), ".125");
+  status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(.125, test.convertToFloat());
+  EXPECT_FALSE(losesInfo);
+  EXPECT_EQ(status, APFloat::opOK);
+
+  // Test inexact rounding
+  losesInfo = false;
+  test = APFloat(APFloat::Float6E2M3FN(), "7.5");
+  status = test.convert(APFloat::Float6E3M2FN(), APFloat::rmNearestTiesToEven,
+                        &losesInfo);
+  EXPECT_EQ(8, test.convertToFloat());
+  EXPECT_TRUE(losesInfo);
+  EXPECT_EQ(status, APFloat::opInexact);
+}
+
+TEST(APFloatTest, Float6E3M2FNNext) {
+  APFloat test(APFloat::Float6E3M2FN(), APFloat::uninitialized);
+  APFloat expected(APFloat::Float6E3M2FN(), APFloat::uninitialized);
+
+  // 1. NextUp of largest bit pattern is the same
+  test = APFloat::getLargest(APFloat::Float6E3M2FN());
+  expected = APFloat::getLargest(APFloat::Float6E3M2FN());
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 2. NextUp of smallest negative denormal is -0
+  test = APFloat::getSmallest(APFloat::Float6E3M2FN(), true);
+  expected = APFloat::getZero(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_TRUE(test.isNegZero());
+  EXPECT_FALSE(test.isPosZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 3. nextDown of negative of largest value is the same
+  test = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+  expected = test;
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_FALSE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 4. nextDown of +0 is smallest negative denormal
+  test = APFloat::getZero(APFloat::Float6E3M2FN(), false);
+  expected = APFloat::getSmallest(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+TEST(APFloatTest, Float6E2M3FNNext) {
+  APFloat test(APFloat::Float6E2M3FN(), APFloat::uninitialized);
+  APFloat expected(APFloat::Float6E2M3FN(), APFloat::uninitialized);
+
+  // 1. NextUp of largest bit pattern is the same
+  test = APFloat::getLargest(APFloat::Float6E2M3FN());
+  expected = APFloat::getLargest(APFloat::Float6E2M3FN());
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 2. NextUp of smallest negative denormal is -0
+  test = APFloat::getSmallest(APFloat::Float6E2M3FN(), true);
+  expected = APFloat::getZero(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(test.next(false), APFloat::opOK);
+  EXPECT_TRUE(test.isNegZero());
+  EXPECT_FALSE(test.isPosZero());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 3. nextDown of negative of largest value is the same
+  test = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+  expected = test;
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isInfinity());
+  EXPECT_FALSE(test.isZero());
+  EXPECT_FALSE(test.isNaN());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+
+  // 4. nextDown of +0 is smallest negative denormal
+  test = APFloat::getZero(APFloat::Float6E2M3FN(), false);
+  expected = APFloat::getSmallest(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(test.next(true), APFloat::opOK);
+  EXPECT_FALSE(test.isZero());
+  EXPECT_TRUE(test.isDenormal());
+  EXPECT_TRUE(test.bitwiseIsEqual(expected));
+}
+
+#ifdef GTEST_HAS_DEATH_TEST
+#ifndef NDEBUG
+TEST(APFloatTest, Float6E3M2FNGetInfNaN) {
+  EXPECT_DEATH(APFloat::getInf(APFloat::Float6E3M2FN()),
+               "This floating point format does not support Inf");
+  EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E3M2FN()),
+               "This floating point format does not support NaN");
+}
+
+TEST(APFloatTest, Float6E2M3FNGetInfNaN) {
+  EXPECT_DEATH(APFloat::getInf(APFloat::Float6E2M3FN()),
+               "This floating point format does not support Inf");
+  EXPECT_DEATH(APFloat::getNaN(APFloat::Float6E2M3FN()),
+               "This floating point format does not support NaN");
+}
+#endif
+#endif
+
+TEST(APFloatTest, Float6E3M2FNToDouble) {
+  APFloat One(APFloat::Float6E3M2FN(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float6E3M2FN(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false);
+  EXPECT_EQ(28., PosLargest.convertToDouble());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(-28., NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+  EXPECT_EQ(0x1p-2, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(-0x1p-2, NegSmallest.convertToDouble());
+
+  APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal());
+  EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToDouble());
+}
+
+TEST(APFloatTest, Float6E2M3FNToDouble) {
+  APFloat One(APFloat::Float6E2M3FN(), "1.0");
+  EXPECT_EQ(1.0, One.convertToDouble());
+  APFloat Two(APFloat::Float6E2M3FN(), "2.0");
+  EXPECT_EQ(2.0, Two.convertToDouble());
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false);
+  EXPECT_EQ(7.5, PosLargest.convertToDouble());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(-7.5, NegLargest.convertToDouble());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+  EXPECT_EQ(0x1p0, PosSmallest.convertToDouble());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(-0x1p0, NegSmallest.convertToDouble());
+
+  APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal());
+  EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToDouble());
+}
+
+TEST(APFloatTest, Float6E3M2FNToFloat) {
+  APFloat PosZero = APFloat::getZero(APFloat::Float6E3M2FN());
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float6E3M2FN(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+  APFloat One(APFloat::Float6E3M2FN(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float6E3M2FN(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), false);
+  EXPECT_EQ(28., PosLargest.convertToFloat());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(-28, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), false);
+  EXPECT_EQ(0x1p-2, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E3M2FN(), true);
+  EXPECT_EQ(-0x1p-2, NegSmallest.convertToFloat());
+
+  APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E3M2FN(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal());
+  EXPECT_EQ(0x0.1p0, SmallestDenorm.convertToFloat());
+}
+
+TEST(APFloatTest, Float6E2M3FNToFloat) {
+  APFloat PosZero = APFloat::getZero(APFloat::Float6E2M3FN());
+  APFloat PosZeroToFloat(PosZero.convertToFloat());
+  EXPECT_TRUE(PosZeroToFloat.isPosZero());
+  APFloat NegZero = APFloat::getZero(APFloat::Float6E2M3FN(), true);
+  APFloat NegZeroToFloat(NegZero.convertToFloat());
+  EXPECT_TRUE(NegZeroToFloat.isNegZero());
+
+  APFloat One(APFloat::Float6E2M3FN(), "1.0");
+  EXPECT_EQ(1.0F, One.convertToFloat());
+  APFloat Two(APFloat::Float6E2M3FN(), "2.0");
+  EXPECT_EQ(2.0F, Two.convertToFloat());
+
+  APFloat PosLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), false);
+  EXPECT_EQ(7.5, PosLargest.convertToFloat());
+  APFloat NegLargest = APFloat::getLargest(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(-7.5, NegLargest.convertToFloat());
+  APFloat PosSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), false);
+  EXPECT_EQ(0x1p0, PosSmallest.convertToFloat());
+  APFloat NegSmallest =
+      APFloat::getSmallestNormalized(APFloat::Float6E2M3FN(), true);
+  EXPECT_EQ(-0x1p0, NegSmallest.convertToFloat());
+
+  APFloat SmallestDenorm = APFloat::getSmallest(APFloat::Float6E2M3FN(), false);
+  EXPECT_TRUE(SmallestDenorm.isDenormal());
+  EXPECT_EQ(0x0.2p0, SmallestDenorm.convertToFloat());
+}
 } // namespace
diff --git a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
index 24f4f11db9a8b9..f6a053792f8529 100644
--- a/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
+++ b/llvm/unittests/Analysis/IRSimilarityIdentifierTest.cpp
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Analysis/IRSimilarityIdentifier.h"
-#include "llvm/ADT/ScopeExit.h"
 #include "llvm/AsmParser/Parser.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
@@ -23,11 +22,6 @@
 using namespace llvm;
 using namespace IRSimilarity;
 
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
-extern bool WriteNewDbgInfoFormatToBitcode;
-extern cl::opt<bool> WriteNewDbgInfoFormat;
-
 static std::unique_ptr<Module> makeLLVMModule(LLVMContext &Context,
                                               StringRef ModuleStr) {
   SMDiagnostic Err;
@@ -1312,18 +1306,19 @@ TEST(IRInstructionMapper, CallBrInstIllegal) {
   ASSERT_GT(UnsignedVec[0], Mapper.IllegalInstrNumber);
 }
 
-// Checks that an debuginfo records are mapped to be invisible. Since they
+// Checks that an debuginfo intrinsics are mapped to be invisible.  Since they
 // do not semantically change the program, they can be recognized as similar.
 TEST(IRInstructionMapper, DebugInfoInvisible) {
   StringRef ModuleString = R"(
                           define i32 @f(i32 %a, i32 %b) {
                           then:
-                            %0 = add i32 %a, %b
-                              #dbg_value(i32 0, !0, !0, !0)
-                            %1 = add i32 %a, %b
+                            %0 = add i32 %a, %b                    
+                            call void @llvm.dbg.value(metadata !0)
+                            %1 = add i32 %a, %b     
                             ret i32 0
                           }
 
+                          declare void @llvm.dbg.value(metadata)
                           !0 = distinct !{!"test\00", i32 10})";
   LLVMContext Context;
   std::unique_ptr<Module> M = makeLLVMModule(Context, ModuleString);
@@ -1919,19 +1914,19 @@ TEST(IRSimilarityCandidate, CheckRegionsDifferentTypes) {
   ASSERT_FALSE(longSimCandCompare(InstrList));
 }
 
-// Check that debug records do not impact similarity. They are marked as
+// Check that debug instructions do not impact similarity. They are marked as
 // invisible.
 TEST(IRSimilarityCandidate, IdenticalWithDebug) {
   StringRef ModuleString = R"(
                           define i32 @f(i32 %a, i32 %b) {
                           bb0:
                              %0 = add i32 %a, %b
-                               #dbg_value(i32 0, !0, !0, !0)
+                             call void @llvm.dbg.value(metadata !0)
                              %1 = add i32 %b, %a
                              ret i32 0
                           bb1:
                              %2 = add i32 %a, %b
-                               #dbg_value(i32 1, !1, !1, !1)
+                             call void @llvm.dbg.value(metadata !1)
                              %3 = add i32 %b, %a
                              ret i32 0
                           bb2:
@@ -1940,6 +1935,7 @@ TEST(IRSimilarityCandidate, IdenticalWithDebug) {
                              ret i32 0       
                           }
 
+                          declare void @llvm.dbg.value(metadata)
                           !0 = distinct !{!"test\00", i32 10}
                           !1 = distinct !{!"test\00", i32 11})";
   LLVMContext Context;
diff --git a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
index 91a0745a0cc76e..f873bbd4293af5 100644
--- a/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
+++ b/llvm/unittests/IR/BasicBlockDbgInfoTest.cpp
@@ -25,6 +25,8 @@
 
 using namespace llvm;
 
+extern cl::opt<bool> UseNewDbgInfoFormat;
+
 static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
   SMDiagnostic Err;
   std::unique_ptr<Module> Mod = parseAssemblyString(IR, Err, C);
@@ -42,6 +44,8 @@ namespace {
 // by DbgVariableRecords, the dbg.value replacement.
 TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -68,6 +72,8 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
+  // Convert the module to "new" form debug-info.
+  M->convertToNewDbgValues();
   // Fetch the entry block.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
 
@@ -97,10 +103,16 @@ TEST(BasicBlockDbgInfoTest, InsertAfterSelf) {
   EXPECT_TRUE(RetInst->hasDbgRecords());
   auto Range2 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(Range2.begin(), Range2.end()), 1u);
+
+  M->convertFromNewDbgValues();
+
+  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"---(
     define dso_local void @func() #0 !dbg !10 {
       %1 = alloca i32, align 4
@@ -138,6 +150,8 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   )---");
   ASSERT_TRUE(M);
 
+  M->convertToNewDbgValues();
+
   Function *F = M->getFunction("func");
 
   BasicBlock &BB = F->getEntryBlock();
@@ -147,10 +161,14 @@ TEST(BasicBlockDbgInfoTest, SplitBasicBlockBefore) {
   BasicBlock &BBBefore = F->getEntryBlock();
   auto I2 = std::prev(BBBefore.end(), 2);
   ASSERT_TRUE(I2->hasDbgRecords());
+
+  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, MarkerOperations) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -178,6 +196,8 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
 
   // Fetch the entry block,
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
+  // Convert the module to "new" form debug-info.
+  M->convertToNewDbgValues();
   EXPECT_EQ(BB.size(), 2u);
 
   // Fetch out our two markers,
@@ -275,10 +295,14 @@ TEST(BasicBlockDbgInfoTest, MarkerOperations) {
 
   // Teardown,
   Instr1->insertBefore(BB, BB.begin());
+
+  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -308,6 +332,8 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
   // Test that the movement of debug-data when using moveBefore etc and
   // insertBefore etc are governed by the "head" bit of iterators.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
+  // Convert the module to "new" form debug-info.
+  M->convertToNewDbgValues();
 
   // Test that the head bit behaves as expected: it should be set when the
   // code wants the _start_ of the block, but not otherwise.
@@ -378,10 +404,14 @@ TEST(BasicBlockDbgInfoTest, HeadBitOperations) {
               DInst->DebugMarker->StoredDbgRecords.empty());
   EXPECT_FALSE(CInst->DebugMarker->StoredDbgRecords.empty());
   EXPECT_EQ(&*BB.begin(), CInst);
+
+  UseNewDbgInfoFormat = false;
 }
 
 TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -411,6 +441,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   // Check that DbgVariableRecords can be accessed from Instructions without
   // digging into the depths of DbgMarkers.
   BasicBlock &BB = M->getFunction("f")->getEntryBlock();
+  // Convert the module to "new" form debug-info.
+  M->convertToNewDbgValues();
 
   Instruction *BInst = &*BB.begin();
   Instruction *CInst = BInst->getNextNode();
@@ -451,6 +483,8 @@ TEST(BasicBlockDbgInfoTest, InstrDbgAccess) {
   CInst->dropOneDbgRecord(DVR1);
   EXPECT_FALSE(CInst->hasDbgRecords());
   EXPECT_EQ(CInst->DebugMarker->StoredDbgRecords.size(), 0u);
+
+  UseNewDbgInfoFormat = false;
 }
 
 /* Let's recall the big illustration from BasicBlock::spliceDebugInfo:
@@ -543,7 +577,9 @@ class DbgSpliceTest : public ::testing::Test {
   DbgVariableRecord *DVRA, *DVRB, *DVRConst;
 
   void SetUp() override {
+    UseNewDbgInfoFormat = true;
     M = parseIR(C, SpliceTestIR.c_str());
+    M->convertToNewDbgValues();
 
     BBEntry = &M->getFunction("f")->getEntryBlock();
     BBExit = BBEntry->getNextNode();
@@ -563,6 +599,8 @@ class DbgSpliceTest : public ::testing::Test {
         cast<DbgVariableRecord>(&*CInst->DebugMarker->StoredDbgRecords.begin());
   }
 
+  void TearDown() override { UseNewDbgInfoFormat = false; }
+
   bool InstContainsDbgVariableRecord(Instruction *I, DbgVariableRecord *DVR) {
     for (DbgRecord &D : I->getDbgRecordRange()) {
       if (&D == DVR) {
@@ -1149,6 +1187,8 @@ metadata !9, metadata !DIExpression()), !dbg !11 Dest      %c = add i16 %b, 1,
 // then the trailing DbgVariableRecords should get flushed back out.
 TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1179,6 +1219,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
+  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1193,6 +1234,8 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
   Instruction *BInst = &*Entry.begin();
   ASSERT_TRUE(BInst->DebugMarker);
   EXPECT_EQ(BInst->DebugMarker->StoredDbgRecords.size(), 1u);
+
+  UseNewDbgInfoFormat = false;
 }
 
 // When we remove instructions from the program, adjacent DbgVariableRecords
@@ -1201,6 +1244,8 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceTrailing) {
 // dbg.values. Test that this can be replicated correctly by DbgVariableRecords
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1228,6 +1273,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
 )");
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
+  M->convertToNewDbgValues();
 
   // Fetch the relevant instructions from the converted function.
   Instruction *SubInst = &*Entry.begin();
@@ -1270,12 +1316,16 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsert) {
   EXPECT_EQ(std::distance(R4.begin(), R4.end()), 1u);
   auto R5 = RetInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R5.begin(), R5.end()), 1u);
+
+  UseNewDbgInfoFormat = false;
 }
 
 // Test instruction removal and re-insertion, this time with one
 // DbgVariableRecord that should hop up one instruction.
 TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1302,6 +1352,7 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
 )");
 
   BasicBlock &Entry = M->getFunction("f")->getEntryBlock();
+  M->convertToNewDbgValues();
 
   // Fetch the relevant instructions from the converted function.
   Instruction *SubInst = &*Entry.begin();
@@ -1340,6 +1391,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
   EXPECT_FALSE(RetInst->hasDbgRecords());
   auto R3 = AddInst->getDbgRecordRange();
   EXPECT_EQ(std::distance(R3.begin(), R3.end()), 1u);
+
+  UseNewDbgInfoFormat = false;
 }
 
 // Similar to the above, what if we splice into an empty block with debug-info,
@@ -1348,6 +1401,8 @@ TEST(BasicBlockDbgInfoTest, RemoveInstAndReinsertForOneDbgVariableRecord) {
 // of the i16 0 dbg.value.
 TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1381,6 +1436,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
+  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1407,12 +1463,16 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty1) {
 
   // No trailing DbgVariableRecords in the entry block now.
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
+
+  UseNewDbgInfoFormat = false;
 }
 
 // Similar test again, but this time: splice the contents of exit into entry,
 // with the intention of leaving the first dbg.value (i16 0) behind.
 TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1446,6 +1506,7 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
+  M->convertToNewDbgValues();
 
   // Begin by forcing entry block to have dangling DbgVariableRecord.
   Entry.getTerminator()->eraseFromParent();
@@ -1476,12 +1537,16 @@ TEST(BasicBlockDbgInfoTest, DbgSpliceToEmpty2) {
   EXPECT_FALSE(Exit.getTrailingDbgRecords()->empty());
   Exit.getTrailingDbgRecords()->eraseFromParent();
   Exit.deleteTrailingDbgRecords();
+
+  UseNewDbgInfoFormat = false;
 }
 
 // What if we moveBefore end() -- there might be no debug-info there, in which
 // case we shouldn't crash.
 TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   LLVMContext C;
+  UseNewDbgInfoFormat = true;
+
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
     entry:
@@ -1511,6 +1576,7 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   Function &F = *M->getFunction("f");
   BasicBlock &Entry = F.getEntryBlock();
   BasicBlock &Exit = *Entry.getNextNode();
+  M->convertToNewDbgValues();
 
   // Move the return to the end of the entry block.
   Instruction *Br = Entry.getTerminator();
@@ -1523,6 +1589,8 @@ TEST(BasicBlockDbgInfoTest, DbgMoveToEnd) {
   EXPECT_EQ(Entry.getTrailingDbgRecords(), nullptr);
   EXPECT_EQ(Exit.getTrailingDbgRecords(), nullptr);
   EXPECT_FALSE(Ret->hasDbgRecords());
+
+  UseNewDbgInfoFormat = false;
 }
 
 } // End anonymous namespace.
diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp
index be5ea6df18bc32..fef85d219b6c56 100644
--- a/llvm/unittests/IR/DebugInfoTest.cpp
+++ b/llvm/unittests/IR/DebugInfoTest.cpp
@@ -156,7 +156,7 @@ TEST(StripTest, LoopMetadata) {
   EXPECT_FALSE(BrokenDebugInfo);
 }
 
-TEST(MetadataTest, DeleteInstUsedByDbgRecord) {
+TEST(MetadataTest, DeleteInstUsedByDbgValue) {
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -187,13 +187,12 @@ TEST(MetadataTest, DeleteInstUsedByDbgRecord) {
 
   // Find the dbg.value using %b.
   SmallVector<DbgValueInst *, 1> DVIs;
-  SmallVector<DbgVariableRecord *, 1> DVRs;
-  findDbgValues(DVIs, &I, &DVRs);
+  findDbgValues(DVIs, &I);
 
   // Delete %b. The dbg.value should now point to undef.
   I.eraseFromParent();
-  EXPECT_EQ(DVRs[0]->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(isa<UndefValue>(DVRs[0]->getValue(0)));
+  EXPECT_EQ(DVIs[0]->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(isa<UndefValue>(DVIs[0]->getValue(0)));
 }
 
 TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
@@ -231,8 +230,8 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
 
   // Get the dbg.declare.
   Function &F = *cast<Function>(M->getNamedValue("fun"));
-  DbgVariableRecord *DbgDeclare =
-      cast<DbgVariableRecord>(&*F.front().front().getDbgRecordRange().begin());
+  DbgVariableIntrinsic *DbgDeclare =
+      cast<DbgVariableIntrinsic>(&F.front().front());
   // Check that this form counts as a "no location" marker.
   EXPECT_TRUE(DbgDeclare->isKillLocation());
 }
@@ -240,9 +239,6 @@ TEST(DbgVariableIntrinsic, EmptyMDIsKillLocation) {
 // Duplicate of above test, but in DbgVariableRecord representation.
 TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
   LLVMContext C;
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = true;
-
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       %b = add i16 %a, 1, !dbg !11
@@ -268,7 +264,10 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
+  M->convertToNewDbgValues();
 
   // Find the DbgVariableRecords using %b.
   SmallVector<DbgValueInst *, 2> DVIs;
@@ -290,8 +289,6 @@ TEST(MetadataTest, DeleteInstUsedByDbgVariableRecord) {
 // Ensure that the order of dbg.value intrinsics returned by findDbgValues, and
 // their corresponding DbgVariableRecord representation, are consistent.
 TEST(MetadataTest, OrderingOfDbgVariableRecords) {
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = false;
   LLVMContext C;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -319,6 +316,8 @@ TEST(MetadataTest, OrderingOfDbgVariableRecords) {
     !12 = !DILocalVariable(name: "bar", scope: !6, file: !1, line: 1, type: !10)
 )");
 
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
   Instruction &I = *M->getFunction("f")->getEntryBlock().getFirstNonPHI();
 
   SmallVector<DbgValueInst *, 2> DVIs;
@@ -517,15 +516,14 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
   Value *V1 = Fun.getArg(0);
   Value *P1 = Fun.getArg(1);
   Value *P2 = Fun.getArg(2);
-  DbgVariableRecord *DbgAssign = cast<DbgVariableRecord>(
-      &*Fun.front().front().getDbgRecordRange().begin());
-  ASSERT_TRUE(V1 == DbgAssign->getVariableLocationOp(0));
-  ASSERT_TRUE(P1 == DbgAssign->getAddress());
+  DbgAssignIntrinsic *DAI = cast<DbgAssignIntrinsic>(Fun.begin()->begin());
+  ASSERT_TRUE(V1 == DAI->getVariableLocationOp(0));
+  ASSERT_TRUE(P1 == DAI->getAddress());
 
 #define TEST_REPLACE(Old, New, ExpectedValue, ExpectedAddr)                    \
-  DbgAssign->replaceVariableLocationOp(Old, New);                              \
-  EXPECT_EQ(DbgAssign->getVariableLocationOp(0), ExpectedValue);               \
-  EXPECT_EQ(DbgAssign->getAddress(), ExpectedAddr);
+  DAI->replaceVariableLocationOp(Old, New);                                    \
+  EXPECT_EQ(DAI->getVariableLocationOp(0), ExpectedValue);                     \
+  EXPECT_EQ(DAI->getAddress(), ExpectedAddr);
 
   // Replace address only.
   TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
@@ -536,8 +534,8 @@ TEST(DbgAssignIntrinsicTest, replaceVariableLocationOp) {
 
   // Replace address only, value uses a DIArgList.
   // Value = {DIArgList(V1)}, Addr = P1.
-  DbgAssign->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
-  DbgAssign->setExpression(DIExpression::get(
+  DAI->setRawLocation(DIArgList::get(C, ValueAsMetadata::get(V1)));
+  DAI->setExpression(DIExpression::get(
       C, {dwarf::DW_OP_LLVM_arg, 0, dwarf::DW_OP_stack_value}));
   TEST_REPLACE(/*Old*/ P1, /*New*/ P2, /*Value*/ V1, /*Address*/ P2);
 #undef TEST_REPLACE
@@ -641,11 +639,11 @@ TEST(AssignmentTrackingTest, Utils) {
   //
   // Check there are two llvm.dbg.assign intrinsics linked to Alloca.
   auto CheckFun1Mapping = [&Alloca]() {
-    auto Markers = at::getDVRAssignmentMarkers(&Alloca);
+    auto Markers = at::getAssignmentMarkers(&Alloca);
     EXPECT_TRUE(std::distance(Markers.begin(), Markers.end()) == 2);
     // Check those two entries are distinct.
-    DbgVariableRecord *First = *Markers.begin();
-    DbgVariableRecord *Second = *std::next(Markers.begin());
+    DbgAssignIntrinsic *First = *Markers.begin();
+    DbgAssignIntrinsic *Second = *std::next(Markers.begin());
     EXPECT_NE(First, Second);
 
     // Check that we can get back to Alloca from each llvm.dbg.assign.
@@ -681,7 +679,7 @@ TEST(AssignmentTrackingTest, Utils) {
   DIAssignID *Fun2ID = cast_or_null<DIAssignID>(
       Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID));
   EXPECT_NE(New, Fun2ID);
-  auto Fun2Markers = at::getDVRAssignmentMarkers(&Fun2Alloca);
+  auto Fun2Markers = at::getAssignmentMarkers(&Fun2Alloca);
   ASSERT_TRUE(std::distance(Fun2Markers.begin(), Fun2Markers.end()) == 1);
   auto Fun2Insts = at::getAssignmentInsts(*Fun2Markers.begin());
   ASSERT_TRUE(std::distance(Fun2Insts.begin(), Fun2Insts.end()) == 1);
@@ -690,10 +688,10 @@ TEST(AssignmentTrackingTest, Utils) {
   // 3. Check that deleting dbg.assigns from a specific instruction works.
   Instruction &Fun3Alloca =
       *M->getFunction("fun3")->getEntryBlock().getFirstNonPHIOrDbg();
-  auto Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
+  auto Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
   ASSERT_TRUE(std::distance(Fun3Markers.begin(), Fun3Markers.end()) == 1);
   at::deleteAssignmentMarkers(&Fun3Alloca);
-  Fun3Markers = at::getDVRAssignmentMarkers(&Fun3Alloca);
+  Fun3Markers = at::getAssignmentMarkers(&Fun3Alloca);
   EXPECT_EQ(Fun3Markers.empty(), true);
 
   // 4. Check that deleting works and applies only to the target function.
@@ -704,7 +702,7 @@ TEST(AssignmentTrackingTest, Utils) {
   // llvm.dbg.assign.
   EXPECT_EQ(Fun2ID, cast_or_null<DIAssignID>(
                         Fun2Alloca.getMetadata(LLVMContext::MD_DIAssignID)));
-  EXPECT_FALSE(at::getDVRAssignmentMarkers(&Fun2Alloca).empty());
+  EXPECT_FALSE(at::getAssignmentMarkers(&Fun2Alloca).empty());
 }
 
 TEST(IRBuilder, GetSetInsertionPointWithEmptyBasicBlock) {
@@ -790,12 +788,12 @@ TEST(AssignmentTrackingTest, InstrMethods) {
   // Use SetVectors to check that the attachments and markers are unique
   // (another test requirement).
   SetVector<Metadata *> OrigIDs;
-  SetVector<DbgVariableRecord *> Markers;
+  SetVector<DbgAssignIntrinsic *> Markers;
   for (const Instruction *SI : Stores) {
     Metadata *ID = SI->getMetadata(LLVMContext::MD_DIAssignID);
     ASSERT_TRUE(OrigIDs.insert(ID));
     ASSERT_TRUE(ID != nullptr);
-    auto Range = at::getDVRAssignmentMarkers(SI);
+    auto Range = at::getAssignmentMarkers(SI);
     ASSERT_TRUE(std::distance(Range.begin(), Range.end()) == 1);
     ASSERT_TRUE(Markers.insert(*Range.begin()));
   }
@@ -888,8 +886,6 @@ TEST(AssignmentTrackingTest, InstrMethods) {
 // dbg.values that have been converted to a non-instruction format.
 TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   LLVMContext C;
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = false;
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
       call void @llvm.dbg.value(metadata i16 %a, metadata !9, metadata !DIExpression()), !dbg !11
@@ -1064,14 +1060,14 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) {
   // The record of those trailing DbgVariableRecords would dangle and cause an
   // assertion failure if it lived until the end of the LLVMContext.
   ExitBlock->deleteTrailingDbgRecords();
-  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   LLVMContext C;
 
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = false;
+  // For the purpose of this test, set and un-set the command line option
+  // corresponding to UseNewDbgInfoFormat.
+  UseNewDbgInfoFormat = true;
 
   std::unique_ptr<Module> M = parseIR(C, R"(
     define i16 @f(i16 %a) !dbg !6 {
@@ -1102,11 +1098,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
     !11 = !DILocation(line: 1, column: 1, scope: !6)
 )");
 
-  // For the purpose of this test, set and un-set the command line option
-  // corresponding to UseNewDbgInfoFormat, but only after parsing, to ensure
-  // that the IR starts off in the old format.
-  UseNewDbgInfoFormat = true;
-
   // Check that the conversion routines and utilities between dbg.value
   // debug-info format and DbgVariableRecords works.
   Function *F = M->getFunction("f");
@@ -1211,7 +1202,7 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) {
   EXPECT_EQ(DVI2->getVariable(), DLV2);
   EXPECT_EQ(DVI2->getExpression(), Expr2);
 
-  UseNewDbgInfoFormat = OldDbgValueMode;
+  UseNewDbgInfoFormat = false;
 }
 
 // Test that the hashing function for DISubprograms representing methods produce
diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp
index ff96df85812002..2001df090aed53 100644
--- a/llvm/unittests/IR/IRBuilderTest.cpp
+++ b/llvm/unittests/IR/IRBuilderTest.cpp
@@ -994,17 +994,17 @@ TEST_F(IRBuilderTest, DIBuilder) {
     EXPECT_TRUE(verifyModule(*M));
   };
 
-  // Test in new-debug mode.
-  EXPECT_TRUE(M->IsNewDbgInfoFormat);
+  // Test in old-debug mode.
+  EXPECT_FALSE(M->IsNewDbgInfoFormat);
   RunTest();
 
-  // Test in old-debug mode.
-  // Reset the test then call convertFromNewDbgValues to flip the flag
+  // Test in new-debug mode.
+  // Reset the test then call convertToNewDbgValues to flip the flag
   // on the test's Module, Function and BasicBlock.
   TearDown();
   SetUp();
-  M->convertFromNewDbgValues();
-  EXPECT_FALSE(M->IsNewDbgInfoFormat);
+  M->convertToNewDbgValues();
+  EXPECT_TRUE(M->IsNewDbgInfoFormat);
   RunTest();
 }
 
diff --git a/llvm/unittests/IR/InstructionsTest.cpp b/llvm/unittests/IR/InstructionsTest.cpp
index 9748650e777808..cbb0d9d8657172 100644
--- a/llvm/unittests/IR/InstructionsTest.cpp
+++ b/llvm/unittests/IR/InstructionsTest.cpp
@@ -26,14 +26,11 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gmock/gmock-matchers.h"
 #include "gtest/gtest.h"
 #include <memory>
 
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-
 namespace llvm {
 namespace {
 
@@ -1463,8 +1460,6 @@ TEST(InstructionsTest, GetSplat) {
 
 TEST(InstructionsTest, SkipDebug) {
   LLVMContext C;
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = false;
   std::unique_ptr<Module> M = parseIR(C,
                                       R"(
       declare void @llvm.dbg.value(metadata, metadata, metadata)
@@ -1500,7 +1495,6 @@ TEST(InstructionsTest, SkipDebug) {
 
   // After the terminator, there are no non-debug instructions.
   EXPECT_EQ(nullptr, Term->getNextNonDebugInstruction());
-  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(InstructionsTest, PhiMightNotBeFPMathOperator) {
diff --git a/llvm/unittests/IR/ValueTest.cpp b/llvm/unittests/IR/ValueTest.cpp
index 33a86d510d45cb..246c2fc7fe4063 100644
--- a/llvm/unittests/IR/ValueTest.cpp
+++ b/llvm/unittests/IR/ValueTest.cpp
@@ -13,7 +13,6 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/ModuleSlotTracker.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/SourceMgr.h"
 #include "gtest/gtest.h"
 using namespace llvm;
@@ -256,8 +255,6 @@ TEST(ValueTest, getLocalSlotDeath) {
 TEST(ValueTest, replaceUsesOutsideBlock) {
   // Check that Value::replaceUsesOutsideBlock(New, BB) replaces uses outside
   // BB, including dbg.* uses of MetadataAsValue(ValueAsMetadata(this)).
-  bool OldDbgValueMode = UseNewDbgInfoFormat;
-  UseNewDbgInfoFormat = false;
   const auto *IR = R"(
     define i32 @f() !dbg !6 {
     entry:
@@ -318,7 +315,6 @@ TEST(ValueTest, replaceUsesOutsideBlock) {
   // These users are outside Entry so should be changed.
   ASSERT_TRUE(ExitDbg->getValue(0) == cast<Value>(B));
   ASSERT_TRUE(Ret->getOperand(0) == cast<Value>(B));
-  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
@@ -363,6 +359,10 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   if (!M)
     Err.print("ValueTest", errs());
 
+  bool OldDbgValueMode = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
+  M->convertToNewDbgValues();
+
   auto GetNext = [](auto *I) { return &*++I->getIterator(); };
 
   Function *F = M->getFunction("f");
@@ -389,6 +389,7 @@ TEST(ValueTest, replaceUsesOutsideBlockDbgVariableRecord) {
   EXPECT_TRUE(DVR1->getVariableLocationOp(0) == cast<Value>(A));
   // These users are outside Entry so should be changed.
   EXPECT_TRUE(DVR2->getVariableLocationOp(0) == cast<Value>(B));
+  UseNewDbgInfoFormat = OldDbgValueMode;
 }
 
 } // end anonymous namespace
diff --git a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
index d4e214c43e98c6..98b154f86c949f 100644
--- a/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
+++ b/llvm/unittests/Target/WebAssembly/WebAssemblyExceptionInfoTest.cpp
@@ -167,7 +167,7 @@ body: |
   WebAssemblyExceptionInfo WEI;
   MachineDominatorTree MDT;
   MachineDominanceFrontier MDF;
-  MDT.runOnMachineFunction(*MF);
+  MDT.calculate(*MF);
   MDF.getBase().analyze(MDT.getBase());
   WEI.recalculate(*MF, MDT, MDF);
 
@@ -342,7 +342,7 @@ body: |
   WebAssemblyExceptionInfo WEI;
   MachineDominatorTree MDT;
   MachineDominanceFrontier MDF;
-  MDT.runOnMachineFunction(*MF);
+  MDT.calculate(*MF);
   MDF.getBase().analyze(MDT.getBase());
   WEI.recalculate(*MF, MDT, MDF);
 
diff --git a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
index 22a8a05ef3719a..128321fc3ae731 100644
--- a/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
+++ b/llvm/unittests/TargetParser/RISCVISAInfoTest.cpp
@@ -312,8 +312,6 @@ TEST(ParseArchString, AcceptsSupportedBaseISAsAndSetsXLenAndFLen) {
 }
 
 TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) {
-  EXPECT_EQ(toString(RISCVISAInfo::parseArchString("rv64ib", true).takeError()),
-            "unsupported standard user-level extension 'b'");
   EXPECT_EQ(
       toString(
           RISCVISAInfo::parseArchString("rv32i_zmadeup", true).takeError()),
@@ -326,9 +324,6 @@ TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) {
       toString(
           RISCVISAInfo::parseArchString("rv64g_xmadeup", true).takeError()),
       "unsupported non-standard user-level extension 'xmadeup'");
-  EXPECT_EQ(
-      toString(RISCVISAInfo::parseArchString("rv64ib1p0", true).takeError()),
-      "unsupported standard user-level extension 'b'");
   EXPECT_EQ(
       toString(
           RISCVISAInfo::parseArchString("rv32i_zmadeup1p0", true).takeError()),
@@ -344,8 +339,7 @@ TEST(ParseArchString, RejectsUnrecognizedExtensionNamesByDefault) {
 }
 
 TEST(ParseArchString, IgnoresUnrecognizedExtensionNamesWithIgnoreUnknown) {
-  for (StringRef Input : {"rv32ib", "rv32i_zmadeup",
-                          "rv64i_smadeup", "rv64i_xmadeup"}) {
+  for (StringRef Input : {"rv32i_zmadeup", "rv64i_smadeup", "rv64i_xmadeup"}) {
     auto MaybeISAInfo = RISCVISAInfo::parseArchString(Input, true, false, true);
     ASSERT_THAT_EXPECTED(MaybeISAInfo, Succeeded());
     RISCVISAInfo &Info = **MaybeISAInfo;
@@ -913,6 +907,7 @@ R"(All available -march extensions for RISC-V
     f                    2.2
     d                    2.2
     c                    2.0
+    b                    1.0
     v                    1.0
     h                    1.0
     zic64b               1.0
diff --git a/llvm/unittests/Transforms/Utils/CloningTest.cpp b/llvm/unittests/Transforms/Utils/CloningTest.cpp
index 1d0d56a2099ceb..5e302d9c0a0d3e 100644
--- a/llvm/unittests/Transforms/Utils/CloningTest.cpp
+++ b/llvm/unittests/Transforms/Utils/CloningTest.cpp
@@ -844,9 +844,8 @@ TEST(CloneFunction, CloneFunctionWithInlinedSubprograms) {
   EXPECT_FALSE(verifyModule(*ImplModule, &errs()));
 
   // Check that DILexicalBlock of inlined function was not cloned.
-  auto DbgDeclareI = Func->begin()->begin()->getDbgRecordRange().begin();
-  auto ClonedDbgDeclareI =
-      ClonedFunc->begin()->begin()->getDbgRecordRange().begin();
+  auto DbgDeclareI = Func->begin()->begin();
+  auto ClonedDbgDeclareI = ClonedFunc->begin()->begin();
   const DebugLoc &DbgLoc = DbgDeclareI->getDebugLoc();
   const DebugLoc &ClonedDbgLoc = ClonedDbgDeclareI->getDebugLoc();
   EXPECT_NE(DbgLoc.get(), ClonedDbgLoc.get());
diff --git a/llvm/unittests/Transforms/Utils/LocalTest.cpp b/llvm/unittests/Transforms/Utils/LocalTest.cpp
index 316d59a9d22969..9b1176765c17f1 100644
--- a/llvm/unittests/Transforms/Utils/LocalTest.cpp
+++ b/llvm/unittests/Transforms/Utils/LocalTest.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/ScopeExit.h"
 #include "llvm/Analysis/DomTreeUpdater.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -27,27 +26,6 @@
 
 using namespace llvm;
 
-extern llvm::cl::opt<bool> UseNewDbgInfoFormat;
-extern cl::opt<cl::boolOrDefault> PreserveInputDbgFormat;
-extern bool WriteNewDbgInfoFormatToBitcode;
-extern cl::opt<bool> WriteNewDbgInfoFormat;
-
-// Backup all of the existing settings that may be modified when
-// PreserveInputDbgFormat=true, so that when the test is finished we return them
-// (and the "preserve" setting) to their original values.
-static auto SaveDbgInfoFormat() {
-  return make_scope_exit(
-      [OldPreserveInputDbgFormat = PreserveInputDbgFormat.getValue(),
-       OldUseNewDbgInfoFormat = UseNewDbgInfoFormat.getValue(),
-       OldWriteNewDbgInfoFormatToBitcode = WriteNewDbgInfoFormatToBitcode,
-       OldWriteNewDbgInfoFormat = WriteNewDbgInfoFormat.getValue()] {
-        PreserveInputDbgFormat = OldPreserveInputDbgFormat;
-        UseNewDbgInfoFormat = OldUseNewDbgInfoFormat;
-        WriteNewDbgInfoFormatToBitcode = OldWriteNewDbgInfoFormatToBitcode;
-        WriteNewDbgInfoFormat = OldWriteNewDbgInfoFormat;
-      });
-}
-
 TEST(Local, RecursivelyDeleteDeadPHINodes) {
   LLVMContext C;
 
@@ -138,6 +116,7 @@ static std::unique_ptr<Module> parseIR(LLVMContext &C, const char *IR) {
 
 TEST(Local, ReplaceDbgDeclare) {
   LLVMContext C;
+
   // Original C source to get debug info for a local variable:
   // void f() { int x; }
   std::unique_ptr<Module> M = parseIR(C,
@@ -145,11 +124,11 @@ TEST(Local, ReplaceDbgDeclare) {
       define void @f() !dbg !8 {
       entry:
         %x = alloca i32, align 4
-          #dbg_declare(ptr %x, !11, !DIExpression(), !13)
-          #dbg_declare(ptr %x, !11, !DIExpression(), !13)
+        call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
+        call void @llvm.dbg.declare(metadata i32* %x, metadata !11, metadata !DIExpression()), !dbg !13
         ret void, !dbg !14
       }
-
+      declare void @llvm.dbg.declare(metadata, metadata, metadata)
       !llvm.dbg.cu = !{!0}
       !llvm.module.flags = !{!3, !4}
       !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -172,18 +151,20 @@ TEST(Local, ReplaceDbgDeclare) {
   Instruction *Inst = &F->front().front();
   auto *AI = dyn_cast<AllocaInst>(Inst);
   ASSERT_TRUE(AI);
-
+  Inst = Inst->getNextNode()->getNextNode();
+  ASSERT_TRUE(Inst);
+  auto *DII = dyn_cast<DbgDeclareInst>(Inst);
+  ASSERT_TRUE(DII);
   Value *NewBase = Constant::getNullValue(PointerType::getUnqual(C));
   DIBuilder DIB(*M);
   replaceDbgDeclare(AI, NewBase, DIB, DIExpression::ApplyOffset, 0);
 
-  // There should be exactly two dbg.declares, attached to the terminator.
-  Inst = F->front().getTerminator();
-  ASSERT_TRUE(Inst);
-  EXPECT_TRUE(Inst->hasDbgRecords());
-  EXPECT_EQ(range_size(Inst->getDbgRecordRange()), 2u);
-  for (DbgVariableRecord &DVR : filterDbgVars(Inst->getDbgRecordRange()))
-    EXPECT_EQ(DVR.getAddress(), NewBase);
+  // There should be exactly two dbg.declares.
+  int Declares = 0;
+  for (const Instruction &I : F->front())
+    if (isa<DbgDeclareInst>(I))
+      Declares++;
+  EXPECT_EQ(2, Declares);
 }
 
 /// Build the dominator tree for the function and run the Test.
@@ -518,10 +499,11 @@ struct SalvageDebugInfoTest : ::testing::Test {
       entry:
         %x = add i32 0, 1
         %y = add i32 %x, 2
-          #dbg_value(i32 %x, !11, !DIExpression(), !13)
-          #dbg_value(i32 %y, !11, !DIExpression(), !13)
+        call void @llvm.dbg.value(metadata i32 %x, metadata !11, metadata !DIExpression()), !dbg !13
+        call void @llvm.dbg.value(metadata i32 %y, metadata !11, metadata !DIExpression()), !dbg !13
         ret void, !dbg !14
       }
+      declare void @llvm.dbg.value(metadata, metadata, metadata)
       !llvm.dbg.cu = !{!0}
       !llvm.module.flags = !{!3, !4}
       !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 6.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
@@ -544,47 +526,48 @@ struct SalvageDebugInfoTest : ::testing::Test {
     ASSERT_TRUE(F);
   }
 
-  bool doesDebugValueDescribeX(const DbgVariableRecord &DVR) {
-    if (DVR.getNumVariableLocationOps() != 1u)
+  bool doesDebugValueDescribeX(const DbgValueInst &DI) {
+    if (DI.getNumVariableLocationOps() != 1u)
       return false;
-    const auto &CI = *cast<ConstantInt>(DVR.getValue(0));
+    const auto &CI = *cast<ConstantInt>(DI.getValue(0));
     if (CI.isZero())
-      return DVR.getExpression()->getElements().equals(
+      return DI.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 1, dwarf::DW_OP_stack_value});
     else if (CI.isOneValue())
-      return DVR.getExpression()->getElements().empty();
+      return DI.getExpression()->getElements().empty();
     return false;
   }
 
-  bool doesDebugValueDescribeY(const DbgVariableRecord &DVR) {
-    if (DVR.getNumVariableLocationOps() != 1u)
+  bool doesDebugValueDescribeY(const DbgValueInst &DI) {
+    if (DI.getNumVariableLocationOps() != 1u)
       return false;
-    const auto &CI = *cast<ConstantInt>(DVR.getVariableLocationOp(0));
+    const auto &CI = *cast<ConstantInt>(DI.getVariableLocationOp(0));
     if (CI.isZero())
-      return DVR.getExpression()->getElements().equals(
+      return DI.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 3, dwarf::DW_OP_stack_value});
     else if (CI.isOneValue())
-      return DVR.getExpression()->getElements().equals(
+      return DI.getExpression()->getElements().equals(
           {dwarf::DW_OP_plus_uconst, 2, dwarf::DW_OP_stack_value});
     return false;
   }
 
   void verifyDebugValuesAreSalvaged() {
-    // The function should only contain debug values and a terminator.
-    EXPECT_EQ(F->size(), 1u);
-    EXPECT_TRUE(F->begin()->begin()->isTerminator());
-
     // Check that the debug values for %x and %y are preserved.
     bool FoundX = false;
     bool FoundY = false;
-    for (DbgVariableRecord &DVR :
-         filterDbgVars(F->begin()->begin()->getDbgRecordRange())) {
-      EXPECT_EQ(DVR.getVariable()->getName(), "x");
-      FoundX |= doesDebugValueDescribeX(DVR);
-      FoundY |= doesDebugValueDescribeY(DVR);
+    for (const Instruction &I : F->front()) {
+      auto DI = dyn_cast<DbgValueInst>(&I);
+      if (!DI) {
+        // The function should only contain debug values and a terminator.
+        ASSERT_TRUE(I.isTerminator());
+        continue;
+      }
+      EXPECT_EQ(DI->getVariable()->getName(), "x");
+      FoundX |= doesDebugValueDescribeX(*DI);
+      FoundY |= doesDebugValueDescribeY(*DI);
     }
-    EXPECT_TRUE(FoundX);
-    EXPECT_TRUE(FoundY);
+    ASSERT_TRUE(FoundX);
+    ASSERT_TRUE(FoundY);
   }
 };
 
@@ -607,12 +590,6 @@ TEST_F(SalvageDebugInfoTest, RecursiveBlockSimplification) {
 
 TEST(Local, wouldInstructionBeTriviallyDead) {
   LLVMContext Ctx;
-  // FIXME: PreserveInputDbgFormat is set to true because this test has
-  // been written to expect debug intrinsics rather than debug records.
-  // TODO: This test doesn't have a DbgRecord equivalent form so delete
-  // it when debug intrinsics are removed.
-  auto SettingGuard = SaveDbgInfoFormat();
-  PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
   std::unique_ptr<Module> M = parseIR(Ctx,
                                       R"(
     define dso_local void @fun() local_unnamed_addr #0 !dbg !9 {
@@ -706,10 +683,12 @@ TEST(Local, FindDbgUsers) {
                                       R"(
   define dso_local void @fun(ptr %a) #0 !dbg !11 {
   entry:
-      #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
+    call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
     ret void
   }
 
+  declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)
+
   !llvm.dbg.cu = !{!0}
   !llvm.module.flags = !{!2, !3, !9}
   !llvm.ident = !{!10}
@@ -736,13 +715,9 @@ TEST(Local, FindDbgUsers) {
   verifyModule(*M, &errs(), &BrokenDebugInfo);
   ASSERT_FALSE(BrokenDebugInfo);
 
-  // Convert to debug intrinsics as we want to test findDbgUsers and
-  // findDbgValue's debug-intrinsic-finding code here.
-  // TODO: Remove this test when debug intrinsics are removed.
-  M->convertFromNewDbgValues();
-
   Function &Fun = *cast<Function>(M->getNamedValue("fun"));
   Value *Arg = Fun.getArg(0);
+
   SmallVector<DbgVariableIntrinsic *> Users;
   // Arg (%a) is used twice by a single dbg.assign. Check findDbgUsers returns
   // only 1 pointer to it rather than 2.
@@ -763,7 +738,7 @@ TEST(Local, FindDbgRecords) {
                                       R"(
   define dso_local void @fun(ptr %a) #0 !dbg !11 {
   entry:
-      #dbg_assign(ptr %a, !16, !DIExpression(), !15, ptr %a, !DIExpression(), !19)
+    call void @llvm.dbg.assign(metadata ptr %a, metadata !16, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression()), !dbg !19
     ret void
   }
 
@@ -792,6 +767,9 @@ TEST(Local, FindDbgRecords) {
   bool BrokenDebugInfo = true;
   verifyModule(*M, &errs(), &BrokenDebugInfo);
   ASSERT_FALSE(BrokenDebugInfo);
+  bool NewDbgInfoFormat = UseNewDbgInfoFormat;
+  UseNewDbgInfoFormat = true;
+  M->convertToNewDbgValues();
 
   Function &Fun = *cast<Function>(M->getNamedValue("fun"));
   Value *Arg = Fun.getArg(0);
@@ -811,10 +789,12 @@ TEST(Local, FindDbgRecords) {
   findDbgValues(Vals, Arg, &Records);
   EXPECT_EQ(Vals.size(), 0u);
   EXPECT_EQ(Records.size(), 1u);
+  UseNewDbgInfoFormat = NewDbgInfoFormat;
 }
 
 TEST(Local, ReplaceAllDbgUsesWith) {
   using namespace llvm::dwarf;
+
   LLVMContext Ctx;
 
   // Note: The datalayout simulates Darwin/x86_64.
@@ -827,36 +807,39 @@ TEST(Local, ReplaceAllDbgUsesWith) {
     define void @f() !dbg !6 {
     entry:
       %a = add i32 0, 1, !dbg !15
+      call void @llvm.dbg.value(metadata i32 %a, metadata !9, metadata !DIExpression()), !dbg !15
 
-        #dbg_value(i32 %a, !9, !DIExpression(), !15)
       %b = add i64 0, 1, !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression()), !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul)), !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value)), !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_LLVM_fragment, 0, 8)), !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
+      call void @llvm.dbg.value(metadata i64 %b, metadata !11, metadata !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8)), !dbg !16
 
-        #dbg_value(i64 %b, !11, !DIExpression(), !16)
-        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul), !16)
-        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value), !16)
-        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_LLVM_fragment, 0, 8), !16)
-        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_LLVM_fragment, 0, 8), !16)
-        #dbg_value(i64 %b, !11, !DIExpression(DW_OP_lit0, DW_OP_mul, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 8), !16)
-      %c = inttoptr i64 0 to ptr, !dbg !17
+      %c = inttoptr i64 0 to i64*, !dbg !17
+      call void @llvm.dbg.declare(metadata i64* %c, metadata !13, metadata !DIExpression()), !dbg !17
 
-        #dbg_declare(ptr %c, !13, !DIExpression(), !17)
-      %d = inttoptr i64 0 to ptr, !dbg !18
+      %d = inttoptr i64 0 to i32*, !dbg !18
+      call void @llvm.dbg.declare(metadata i32* %d, metadata !20, metadata !DIExpression()), !dbg !18
 
-        #dbg_declare(ptr %d,  !20,  !DIExpression(), !18)
       %e = add <2 x i16> zeroinitializer, zeroinitializer
+      call void @llvm.dbg.value(metadata <2 x i16> %e, metadata !14, metadata !DIExpression()), !dbg !18
 
-        #dbg_value(<2 x i16> %e, !14, !DIExpression(), !18)
       %f = call i32 @escape(i32 0)
+      call void @llvm.dbg.value(metadata i32 %f, metadata !9, metadata !DIExpression()), !dbg !15
 
-        #dbg_value(i32 %f, !9, !DIExpression(), !15)
       %barrier = call i32 @escape(i32 0)
 
       %g = call i32 @escape(i32 %f)
+      call void @llvm.dbg.value(metadata i32 %g, metadata !9, metadata !DIExpression()), !dbg !15
 
-        #dbg_value(i32 %g, !9, !DIExpression(), !15)
       ret void, !dbg !19
     }
 
+    declare void @llvm.dbg.declare(metadata, metadata, metadata)
+    declare void @llvm.dbg.value(metadata, metadata, metadata)
+
     !llvm.dbg.cu = !{!0}
     !llvm.module.flags = !{!5}
 
@@ -911,47 +894,38 @@ TEST(Local, ReplaceAllDbgUsesWith) {
   EXPECT_TRUE(replaceAllDbgUsesWith(D, C, C, DT));
 
   SmallVector<DbgVariableIntrinsic *, 2> CDbgVals;
-  SmallVector<DbgVariableRecord *, 2> CDbgRecords;
-  findDbgUsers(CDbgVals, &C, &CDbgRecords);
-  EXPECT_EQ(0U, CDbgVals.size());
-  EXPECT_EQ(2U, CDbgRecords.size());
-  EXPECT_TRUE(all_of(
-      CDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
+  findDbgUsers(CDbgVals, &C);
+  EXPECT_EQ(2U, CDbgVals.size());
+  EXPECT_TRUE(all_of(CDbgVals, [](DbgVariableIntrinsic *DII) {
+    return isa<DbgDeclareInst>(DII);
+  }));
 
   EXPECT_TRUE(replaceAllDbgUsesWith(C, D, D, DT));
 
   SmallVector<DbgVariableIntrinsic *, 2> DDbgVals;
-  SmallVector<DbgVariableRecord *, 2> DDbgRecords;
-  findDbgUsers(DDbgVals, &D, &DDbgRecords);
-  EXPECT_EQ(0U, DDbgVals.size());
-  EXPECT_EQ(2U, DDbgRecords.size());
-  EXPECT_TRUE(all_of(
-      DDbgRecords, [](DbgVariableRecord *DVR) { return DVR->isDbgDeclare(); }));
+  findDbgUsers(DDbgVals, &D);
+  EXPECT_EQ(2U, DDbgVals.size());
+  EXPECT_TRUE(all_of(DDbgVals, [](DbgVariableIntrinsic *DII) {
+    return isa<DbgDeclareInst>(DII);
+  }));
 
   // Introduce a use-before-def. Check that the dbg.value for %a is salvaged.
   EXPECT_TRUE(replaceAllDbgUsesWith(A, F_, F_, DT));
 
-  EXPECT_FALSE(A.hasDbgRecords());
-  EXPECT_TRUE(B.hasDbgRecords());
-  DbgVariableRecord *BDbgVal =
-      cast<DbgVariableRecord>(&*B.getDbgRecordRange().begin());
-  EXPECT_EQ(BDbgVal->getNumVariableLocationOps(), 1u);
-  EXPECT_EQ(ConstantInt::get(A.getType(), 0),
-            BDbgVal->getVariableLocationOp(0));
+  auto *ADbgVal = cast<DbgValueInst>(A.getNextNode());
+  EXPECT_EQ(ADbgVal->getNumVariableLocationOps(), 1u);
+  EXPECT_EQ(ConstantInt::get(A.getType(), 0), ADbgVal->getVariableLocationOp(0));
 
   // Introduce a use-before-def. Check that the dbg.values for %f become undef.
   EXPECT_TRUE(replaceAllDbgUsesWith(F_, G, G, DT));
 
-  DbgVariableRecord *BarrierDbgVal =
-      cast<DbgVariableRecord>(&*Barrier.getDbgRecordRange().begin());
-  EXPECT_EQ(BarrierDbgVal->getNumVariableLocationOps(), 1u);
-  EXPECT_TRUE(BarrierDbgVal->isKillLocation());
+  auto *FDbgVal = cast<DbgValueInst>(F_.getNextNode());
+  EXPECT_EQ(FDbgVal->getNumVariableLocationOps(), 1u);
+  EXPECT_TRUE(FDbgVal->isKillLocation());
 
-  SmallVector<DbgValueInst *, 1> BarrierDbgVals;
-  SmallVector<DbgVariableRecord *, 8> BarrierDbgRecs;
-  findDbgValues(BarrierDbgVals, &F_, &BarrierDbgRecs);
-  EXPECT_EQ(0U, BarrierDbgVals.size());
-  EXPECT_EQ(0U, BarrierDbgRecs.size());
+  SmallVector<DbgValueInst *, 1> FDbgVals;
+  findDbgValues(FDbgVals, &F_);
+  EXPECT_EQ(0U, FDbgVals.size());
 
   // Simulate i32 -> i64 conversion to test sign-extension. Here are some
   // interesting cases to handle:
@@ -961,15 +935,13 @@ TEST(Local, ReplaceAllDbgUsesWith) {
   //  4-6) like (1-3), but with a fragment
   EXPECT_TRUE(replaceAllDbgUsesWith(B, A, A, DT));
 
-  SmallVector<DbgValueInst *, 8> BDbgVals;
-  SmallVector<DbgVariableRecord *, 8> BDbgRecs;
-  findDbgValues(BDbgVals, &A, &BDbgRecs);
-  EXPECT_EQ(0U, BDbgVals.size());
-  EXPECT_EQ(6U, BDbgRecs.size());
+  SmallVector<DbgValueInst *, 8> ADbgVals;
+  findDbgValues(ADbgVals, &A);
+  EXPECT_EQ(6U, ADbgVals.size());
 
   // Check that %a has a dbg.value with a DIExpression matching \p Ops.
   auto hasADbgVal = [&](ArrayRef<uint64_t> Ops) {
-    return any_of(BDbgRecs, [&](DbgVariableRecord *DVI) {
+    return any_of(ADbgVals, [&](DbgValueInst *DVI) {
       assert(DVI->getVariable()->getName() == "2");
       return DVI->getExpression()->getElements() == Ops;
     });
@@ -1372,11 +1344,6 @@ TEST(Local, ExpressionForConstant) {
 
 TEST(Local, ReplaceDbgVariableRecord) {
   LLVMContext C;
-  // FIXME: PreserveInputDbgFormat is set to true because this test has
-  // been written to expect debug intrinsics rather than debug records; use the
-  // intrinsic format until we update the test checks.
-  auto SettingGuard = SaveDbgInfoFormat();
-  PreserveInputDbgFormat = cl::boolOrDefault::BOU_TRUE;
 
   // Test that RAUW also replaces the operands of DbgVariableRecord objects,
   // i.e. non-instruction stored debugging information.
diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index e16ca31b81a8d3..780a69f1f3299b 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -18,6 +18,7 @@ unittest("ClangAnalysisFlowSensitiveTests") {
     "//llvm/lib/Testing/Support",
   ]
   sources = [
+    "ASTOpsTest.cpp",
     "ArenaTest.cpp",
     "CFGMatchSwitchTest.cpp",
     "ChromiumCheckModelTest.cpp",
diff --git a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
index 48b7835ae5fca6..2d5e9d27c5bdfc 100644
--- a/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
+++ b/mlir/include/mlir/Conversion/GPUCommon/GPUCommonPass.h
@@ -46,9 +46,6 @@ class LLVMDialect;
 #define GEN_PASS_DECL_GPUTOLLVMCONVERSIONPASS
 #include "mlir/Conversion/Passes.h.inc"
 
-using OwnedBlob = std::unique_ptr<std::vector<char>>;
-using BlobGenerator =
-    std::function<OwnedBlob(const std::string &, Location, StringRef)>;
 using LoweringCallback = std::function<std::unique_ptr<llvm::Module>(
     Operation *, llvm::LLVMContext &, StringRef)>;
 
@@ -66,10 +63,9 @@ struct FunctionCallBuilder {
 
 /// Collect a set of patterns to convert from the GPU dialect to LLVM and
 /// populate converter for gpu types.
-void populateGpuToLLVMConversionPatterns(
-    LLVMTypeConverter &converter, RewritePatternSet &patterns,
-    StringRef gpuBinaryAnnotation = {}, bool kernelBarePtrCallConv = false,
-    SymbolTable *cachedModuleTable = nullptr);
+void populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
+                                         RewritePatternSet &patterns,
+                                         bool kernelBarePtrCallConv = false);
 
 /// A function that maps a MemorySpace enum to a target-specific integer value.
 using MemorySpaceMapping = std::function<unsigned(gpu::AddressSpace)>;
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index eb58f4adc31d36..db67d6a5ff1287 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -478,11 +478,7 @@ def GpuToLLVMConversionPass : Pass<"gpu-to-llvm", "ModuleOp"> {
            /*default=*/"false",
              "Use bare pointers to pass memref arguments to kernels. "
              "The kernel must use the same setting for this option."
-           >,
-    Option<"gpuBinaryAnnotation", "gpu-binary-annotation", "std::string",
-               /*default=*/"gpu::getDefaultGpuBinaryAnnotation()",
-               "Annotation attribute string for GPU binary"
-               >
+           >
   ];
 
   let dependentDialects = [
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
index 2d8add82383bef..2fda091e412aef 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -309,6 +309,11 @@ struct BufferizationOptions {
   /// bufferized or not.
   bool bufferizeFunctionBoundaries = false;
 
+  // Specifies whether to account for parallel regions in RaW analysis. If true,
+  // then writes inside of parallel regions that write to buffers defined
+  // outside of the parallel region will be given a new buffer.
+  bool checkParallelRegions = true;
+
   /// Certain ops have aliasing OpOperand/OpResult invariants (e.g., scf.for).
   /// If this flag is set to `false`, those invariants are no longer enforced
   /// with buffer copies.
diff --git a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
index 5ace9c390e146e..53b3b0505b3992 100644
--- a/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/TransformOps/BufferizationTransformOps.td
@@ -88,6 +88,7 @@ def OneShotBufferizeOp
       DefaultValuedAttr<BoolAttr, "false">:$dump_alias_sets,
       DefaultValuedAttr<BoolAttr, "false">:$test_analysis_only,
       DefaultValuedAttr<BoolAttr, "false">:$print_conflicts,
+      DefaultValuedAttr<BoolAttr, "true">:$check_parallel_regions,
       DefaultValuedAttr<StrAttr, "\"memref.copy\"">:$memcpy_op);
 
   let results = (outs TransformHandleTypeInterface:$transformed);
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 8f8826b9ad56b4..1cece818dbbbc3 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -498,6 +498,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
     Option<"bufferizeFunctionBoundaries", "bufferize-function-boundaries",
            "bool", /*default=*/"0",
            "Bufferize function boundaries (experimental).">,
+    Option<"checkParallelRegions", "check-parallel-regions", "bool",
+           /*default=*/"true", "Account for parallel regions in RaW analysis.">,
     Option<"copyBeforeWrite", "copy-before-write", "bool", /*default=*/"false",
            "Skip the analysis. Make a buffer copy on every write.">,
     ListOption<"dialectFilter", "dialect-filter", "std::string",
diff --git a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
index d8150aeb828a59..6656be830989a4 100644
--- a/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
+++ b/mlir/lib/Conversion/ComplexToStandard/ComplexToStandard.cpp
@@ -40,31 +40,35 @@ Value computeAbs(Value real, Value imag, arith::FastMathFlags fmf,
 
   Value max = b.create<arith::MaximumFOp>(absReal, absImag, fmf);
   Value min = b.create<arith::MinimumFOp>(absReal, absImag, fmf);
-  Value ratio = b.create<arith::DivFOp>(min, max, fmf);
-  Value ratioSq = b.create<arith::MulFOp>(ratio, ratio, fmf);
-  Value ratioSqPlusOne = b.create<arith::AddFOp>(ratioSq, one, fmf);
+
+  // The lowering below requires NaNs and infinities to work correctly.
+  arith::FastMathFlags fmfWithNaNInf = arith::bitEnumClear(
+      fmf, arith::FastMathFlags::nnan | arith::FastMathFlags::ninf);
+  Value ratio = b.create<arith::DivFOp>(min, max, fmfWithNaNInf);
+  Value ratioSq = b.create<arith::MulFOp>(ratio, ratio, fmfWithNaNInf);
+  Value ratioSqPlusOne = b.create<arith::AddFOp>(ratioSq, one, fmfWithNaNInf);
   Value result;
 
   if (fn == AbsFn::rsqrt) {
-    ratioSqPlusOne = b.create<math::RsqrtOp>(ratioSqPlusOne, fmf);
-    min = b.create<math::RsqrtOp>(min, fmf);
-    max = b.create<math::RsqrtOp>(max, fmf);
+    ratioSqPlusOne = b.create<math::RsqrtOp>(ratioSqPlusOne, fmfWithNaNInf);
+    min = b.create<math::RsqrtOp>(min, fmfWithNaNInf);
+    max = b.create<math::RsqrtOp>(max, fmfWithNaNInf);
   }
 
   if (fn == AbsFn::sqrt) {
     Value quarter = b.create<arith::ConstantOp>(
         real.getType(), b.getFloatAttr(real.getType(), 0.25));
     // sqrt(sqrt(a*b)) would avoid the pow, but will overflow more easily.
-    Value sqrt = b.create<math::SqrtOp>(max, fmf);
-    Value p025 = b.create<math::PowFOp>(ratioSqPlusOne, quarter, fmf);
-    result = b.create<arith::MulFOp>(sqrt, p025, fmf);
+    Value sqrt = b.create<math::SqrtOp>(max, fmfWithNaNInf);
+    Value p025 = b.create<math::PowFOp>(ratioSqPlusOne, quarter, fmfWithNaNInf);
+    result = b.create<arith::MulFOp>(sqrt, p025, fmfWithNaNInf);
   } else {
-    Value sqrt = b.create<math::SqrtOp>(ratioSqPlusOne, fmf);
-    result = b.create<arith::MulFOp>(max, sqrt, fmf);
+    Value sqrt = b.create<math::SqrtOp>(ratioSqPlusOne, fmfWithNaNInf);
+    result = b.create<arith::MulFOp>(max, sqrt, fmfWithNaNInf);
   }
 
-  Value isNaN =
-      b.create<arith::CmpFOp>(arith::CmpFPredicate::UNO, result, result, fmf);
+  Value isNaN = b.create<arith::CmpFOp>(arith::CmpFPredicate::UNO, result,
+                                        result, fmfWithNaNInf);
   return b.create<arith::SelectOp>(isNaN, min, result);
 }
 
@@ -595,17 +599,20 @@ struct Log1pOpConversion : public OpConversionPattern<complex::Log1pOp> {
     Value maxMinusOne = b.create<arith::SubFOp>(maxAbs, one, fmf);
     Value maxAbsOfRealPlusOneAndImagMinusOne =
         b.create<arith::SelectOp>(useReal, real, maxMinusOne);
-    Value minMaxRatio = b.create<arith::DivFOp>(minAbs, maxAbs, fmf);
+    arith::FastMathFlags fmfWithNaNInf = arith::bitEnumClear(
+        fmf, arith::FastMathFlags::nnan | arith::FastMathFlags::ninf);
+    Value minMaxRatio = b.create<arith::DivFOp>(minAbs, maxAbs, fmfWithNaNInf);
     Value logOfMaxAbsOfRealPlusOneAndImag =
         b.create<math::Log1pOp>(maxAbsOfRealPlusOneAndImagMinusOne, fmf);
     Value logOfSqrtPart = b.create<math::Log1pOp>(
-        b.create<arith::MulFOp>(minMaxRatio, minMaxRatio, fmf), fmf);
+        b.create<arith::MulFOp>(minMaxRatio, minMaxRatio, fmfWithNaNInf),
+        fmfWithNaNInf);
     Value r = b.create<arith::AddFOp>(
-        b.create<arith::MulFOp>(half, logOfSqrtPart, fmf),
-        logOfMaxAbsOfRealPlusOneAndImag, fmf);
+        b.create<arith::MulFOp>(half, logOfSqrtPart, fmfWithNaNInf),
+        logOfMaxAbsOfRealPlusOneAndImag, fmfWithNaNInf);
     Value resultReal = b.create<arith::SelectOp>(
-        b.create<arith::CmpFOp>(arith::CmpFPredicate::UNO, r, r, fmf), minAbs,
-        r);
+        b.create<arith::CmpFOp>(arith::CmpFPredicate::UNO, r, r, fmfWithNaNInf),
+        minAbs, r);
     Value resultImag = b.create<math::Atan2Op>(imag, realPlusOne, fmf);
     rewriter.replaceOpWithNewOp<complex::CreateOp>(op, type, resultReal,
                                                    resultImag);
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
index 82bfa9514a8841..92b28ff9c58737 100644
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -49,8 +49,6 @@ namespace mlir {
 
 using namespace mlir;
 
-static constexpr const char *kGpuBinaryStorageSuffix = "_gpubin_cst";
-
 namespace {
 class GpuToLLVMConversionPass
     : public impl::GpuToLLVMConversionPassBase<GpuToLLVMConversionPass> {
@@ -97,36 +95,6 @@ class ConvertOpToGpuRuntimeCallPattern : public ConvertOpToLLVMPattern<OpTy> {
   Type llvmIntPtrType = IntegerType::get(
       context, this->getTypeConverter()->getPointerBitwidth(0));
 
-  FunctionCallBuilder moduleLoadCallBuilder = {
-      "mgpuModuleLoad",
-      llvmPointerType /* void *module */,
-      {llvmPointerType /* void *cubin */, llvmInt64Type /* size_t size */}};
-  FunctionCallBuilder moduleUnloadCallBuilder = {
-      "mgpuModuleUnload", llvmVoidType, {llvmPointerType /* void *module */}};
-  FunctionCallBuilder moduleGetFunctionCallBuilder = {
-      "mgpuModuleGetFunction",
-      llvmPointerType /* void *function */,
-      {
-          llvmPointerType, /* void *module */
-          llvmPointerType  /* char *name   */
-      }};
-  FunctionCallBuilder launchKernelCallBuilder = {
-      "mgpuLaunchKernel",
-      llvmVoidType,
-      {
-          llvmPointerType, /* void* f */
-          llvmIntPtrType,  /* intptr_t gridXDim */
-          llvmIntPtrType,  /* intptr_t gridyDim */
-          llvmIntPtrType,  /* intptr_t gridZDim */
-          llvmIntPtrType,  /* intptr_t blockXDim */
-          llvmIntPtrType,  /* intptr_t blockYDim */
-          llvmIntPtrType,  /* intptr_t blockZDim */
-          llvmInt32Type,   /* unsigned int sharedMemBytes */
-          llvmPointerType, /* void *hstream */
-          llvmPointerType, /* void **kernelParams */
-          llvmPointerType, /* void **extra */
-          llvmInt64Type    /* size_t paramsCount */
-      }};
   FunctionCallBuilder streamCreateCallBuilder = {
       "mgpuStreamCreate", llvmPointerType /* void *stream */, {}};
   FunctionCallBuilder streamDestroyCallBuilder = {
@@ -451,55 +419,21 @@ class ConvertWaitAsyncOpToGpuRuntimeCallPattern
                   ConversionPatternRewriter &rewriter) const override;
 };
 
-/// A rewrite patter to convert gpu.launch_func operations into a sequence of
-/// GPU runtime calls. Currently it supports CUDA and ROCm (HIP).
-///
-/// In essence, a gpu.launch_func operations gets compiled into the following
-/// sequence of runtime calls:
-///
-/// * moduleLoad        -- loads the module given the cubin / hsaco data
-/// * moduleGetFunction -- gets a handle to the actual kernel function
-/// * getStreamHelper   -- initializes a new compute stream on GPU
-/// * launchKernel      -- launches the kernel on a stream
-/// * streamSynchronize -- waits for operations on the stream to finish
-///
-/// Intermediate data structures are allocated on the stack.
-class ConvertLaunchFuncOpToGpuRuntimeCallPattern
+/// A rewrite patter to legalize gpu.launch_func with LLVM types.
+class LegalizeLaunchFuncOpPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp> {
 public:
-  ConvertLaunchFuncOpToGpuRuntimeCallPattern(
-      const LLVMTypeConverter &typeConverter, StringRef gpuBinaryAnnotation,
-      bool kernelBarePtrCallConv, SymbolTable *cachedModuleTable)
+  LegalizeLaunchFuncOpPattern(const LLVMTypeConverter &typeConverter,
+                              bool kernelBarePtrCallConv)
       : ConvertOpToGpuRuntimeCallPattern<gpu::LaunchFuncOp>(typeConverter),
-        gpuBinaryAnnotation(gpuBinaryAnnotation),
-        kernelBarePtrCallConv(kernelBarePtrCallConv),
-        cachedModuleTable(cachedModuleTable) {}
+        kernelBarePtrCallConv(kernelBarePtrCallConv) {}
 
 private:
-  Value generateParamsArray(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
-                            OpBuilder &builder) const;
-  Value generateKernelNameConstant(StringRef moduleName, StringRef name,
-                                   Location loc, OpBuilder &builder) const;
-
   LogicalResult
   matchAndRewrite(gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override;
 
-  llvm::SmallString<32> gpuBinaryAnnotation;
   bool kernelBarePtrCallConv;
-  SymbolTable *cachedModuleTable;
-};
-
-class EraseGpuModuleOpPattern : public OpRewritePattern<gpu::GPUModuleOp> {
-  using OpRewritePattern<gpu::GPUModuleOp>::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(gpu::GPUModuleOp op,
-                                PatternRewriter &rewriter) const override {
-    // GPU kernel modules are no longer necessary since we have a global
-    // constant with the CUBIN, or HSACO data.
-    rewriter.eraseOp(op);
-    return success();
-  }
 };
 
 /// A rewrite pattern to convert gpu.memcpy operations into a GPU runtime
@@ -587,7 +521,6 @@ DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SetCsrPointersOp)
 
 void GpuToLLVMConversionPass::runOnOperation() {
   MLIRContext *context = &getContext();
-  SymbolTable symbolTable = SymbolTable(getOperation());
   LowerToLLVMOptions options(context);
   options.useBarePtrCallConv = hostBarePtrCallConv;
   RewritePatternSet patterns(context);
@@ -604,30 +537,20 @@ void GpuToLLVMConversionPass::runOnOperation() {
     iface->populateConvertToLLVMConversionPatterns(target, converter, patterns);
   }
 
-  // Preserve GPU modules if they have target attributes.
-  target.addDynamicallyLegalOp<gpu::GPUModuleOp>(
-      [](gpu::GPUModuleOp module) -> bool {
-        return module.getTargetsAttr() != nullptr;
-      });
-  // Accept as legal LaunchFuncOps if they refer to GPU Modules with targets and
-  // the operands have been lowered.
+  // Preserve GPU modules and binaries. Modules are preserved as they can be
+  // converted later by `gpu-module-to-binary`.
+  target.addLegalOp<gpu::GPUModuleOp, gpu::BinaryOp>();
+  // Accept as legal LaunchFuncOps if the operands have been lowered.
   target.addDynamicallyLegalOp<gpu::LaunchFuncOp>(
-      [&](gpu::LaunchFuncOp op) -> bool {
-        auto module =
-            symbolTable.lookup<gpu::GPUModuleOp>(op.getKernelModuleName());
-        return converter.isLegal(op->getOperandTypes()) &&
-               converter.isLegal(op->getResultTypes()) &&
-               (module && module.getTargetsAttr() &&
-                !module.getTargetsAttr().empty());
-      });
+      [&](gpu::LaunchFuncOp op) -> bool { return converter.isLegal(op); });
 
   // These aren't covered by the ConvertToLLVMPatternInterface right now.
   populateVectorToLLVMConversionPatterns(converter, patterns);
   populateFinalizeMemRefToLLVMConversionPatterns(converter, patterns);
   populateAsyncStructuralTypeConversionsAndLegality(converter, patterns,
                                                     target);
-  populateGpuToLLVMConversionPatterns(converter, patterns, gpuBinaryAnnotation,
-                                      kernelBarePtrCallConv, &symbolTable);
+  populateGpuToLLVMConversionPatterns(converter, patterns,
+                                      kernelBarePtrCallConv);
 
   if (failed(
           applyPartialConversion(getOperation(), target, std::move(patterns))))
@@ -1002,100 +925,8 @@ LogicalResult ConvertWaitAsyncOpToGpuRuntimeCallPattern::matchAndRewrite(
   return success();
 }
 
-// Creates a struct containing all kernel parameters on the stack and returns
-// an array of type-erased pointers to the fields of the struct. The array can
-// then be passed to the CUDA / ROCm (HIP) kernel launch calls.
-// The generated code is essentially as follows:
-//
-// %struct = alloca(sizeof(struct { Parameters... }))
-// %array = alloca(NumParameters * sizeof(void *))
-// for (i : [0, NumParameters))
-//   %fieldPtr = llvm.getelementptr %struct[0, i]
-//   llvm.store parameters[i], %fieldPtr
-//   %elementPtr = llvm.getelementptr %array[i]
-//   llvm.store %fieldPtr, %elementPtr
-// return %array
-Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateParamsArray(
-    gpu::LaunchFuncOp launchOp, OpAdaptor adaptor, OpBuilder &builder) const {
-  auto loc = launchOp.getLoc();
-  auto numKernelOperands = launchOp.getNumKernelOperands();
-  // Note: If `useBarePtrCallConv` is set in the type converter's options,
-  // the value of `kernelBarePtrCallConv` will be ignored.
-  SmallVector<Value, 4> arguments = getTypeConverter()->promoteOperands(
-      loc, launchOp.getOperands().take_back(numKernelOperands),
-      adaptor.getOperands().take_back(numKernelOperands), builder,
-      /*useBarePtrCallConv=*/kernelBarePtrCallConv);
-  auto numArguments = arguments.size();
-  SmallVector<Type, 4> argumentTypes;
-  argumentTypes.reserve(numArguments);
-  for (auto argument : arguments)
-    argumentTypes.push_back(argument.getType());
-  auto structType = LLVM::LLVMStructType::getNewIdentified(context, StringRef(),
-                                                           argumentTypes);
-  auto one = builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, 1);
-  auto structPtr =
-      builder.create<LLVM::AllocaOp>(loc, llvmPointerType, structType, one,
-                                     /*alignment=*/0);
-  auto arraySize =
-      builder.create<LLVM::ConstantOp>(loc, llvmInt32Type, numArguments);
-  auto arrayPtr = builder.create<LLVM::AllocaOp>(
-      loc, llvmPointerType, llvmPointerType, arraySize, /*alignment=*/0);
-  for (const auto &en : llvm::enumerate(arguments)) {
-    const auto index = static_cast<int32_t>(en.index());
-    Value fieldPtr =
-        builder.create<LLVM::GEPOp>(loc, llvmPointerType, structType, structPtr,
-                                    ArrayRef<LLVM::GEPArg>{0, index});
-    builder.create<LLVM::StoreOp>(loc, en.value(), fieldPtr);
-    auto elementPtr =
-        builder.create<LLVM::GEPOp>(loc, llvmPointerType, llvmPointerType,
-                                    arrayPtr, ArrayRef<LLVM::GEPArg>{index});
-    builder.create<LLVM::StoreOp>(loc, fieldPtr, elementPtr);
-  }
-  return arrayPtr;
-}
-
-// Generates an LLVM IR dialect global that contains the name of the given
-// kernel function as a C string, and returns a pointer to its beginning.
-// The code is essentially:
-//
-// llvm.global constant @kernel_name("function_name\00")
-// func(...) {
-//   %0 = llvm.addressof @kernel_name
-//   %1 = llvm.constant (0 : index)
-//   %2 = llvm.getelementptr %0[%1, %1] : !llvm<"i8*">
-// }
-Value ConvertLaunchFuncOpToGpuRuntimeCallPattern::generateKernelNameConstant(
-    StringRef moduleName, StringRef name, Location loc,
-    OpBuilder &builder) const {
-  // Make sure the trailing zero is included in the constant.
-  std::vector<char> kernelName(name.begin(), name.end());
-  kernelName.push_back('\0');
-
-  std::string globalName =
-      std::string(llvm::formatv("{0}_{1}_kernel_name", moduleName, name));
-  return LLVM::createGlobalString(
-      loc, builder, globalName, StringRef(kernelName.data(), kernelName.size()),
-      LLVM::Linkage::Internal);
-}
-
-// Emits LLVM IR to launch a kernel function. Expects the module that contains
-// the compiled kernel function as a cubin in the 'nvvm.cubin' attribute, or a
-// hsaco in the 'rocdl.hsaco' attribute of the kernel function in the IR.
-//
-// %0 = call %binarygetter
-// %1 = call %moduleLoad(%0)
-// %2 = <see generateKernelNameConstant>
-// %3 = call %moduleGetFunction(%1, %2)
-// %4 = call %streamCreate()
-// %5 = <see generateParamsArray>
-// call %launchKernel(%3, <launchOp operands 0..5>, 0, %4, %5, nullptr)
-// call %streamSynchronize(%4)
-// call %streamDestroy(%4)
-// call %moduleUnload(%1)
-//
-// If the op is async, the stream corresponds to the (single) async dependency
-// as well as the async token the op produces.
-LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
+// Legalize the op's operands.
+LogicalResult LegalizeLaunchFuncOpPattern::matchAndRewrite(
     gpu::LaunchFuncOp launchOp, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(launchOp, adaptor.getOperands(), rewriter)))
@@ -1114,123 +945,37 @@ LogicalResult ConvertLaunchFuncOpToGpuRuntimeCallPattern::matchAndRewrite(
 
   Location loc = launchOp.getLoc();
 
-  // Create an LLVM global with CUBIN extracted from the kernel annotation and
-  // obtain a pointer to the first byte in it.
-  gpu::GPUModuleOp kernelModule;
-  if (cachedModuleTable)
-    kernelModule = cachedModuleTable->lookup<gpu::GPUModuleOp>(
-        launchOp.getKernelModuleName());
-  else
-    kernelModule = SymbolTable::lookupNearestSymbolFrom<gpu::GPUModuleOp>(
-        launchOp, launchOp.getKernelModuleName());
-  assert(kernelModule && "expected a kernel module");
-
-  // If the module has Targets then just update the op operands.
-  if (ArrayAttr targets = kernelModule.getTargetsAttr()) {
-    Value stream = Value();
-    if (!adaptor.getAsyncDependencies().empty())
-      stream = adaptor.getAsyncDependencies().front();
-    // If the async keyword is present and there are no dependencies, then a
-    // stream must be created to pass to subsequent operations.
-    else if (launchOp.getAsyncToken())
-      stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
-
-    // Lower the kernel operands to match kernel parameters.
-    // Note: If `useBarePtrCallConv` is set in the type converter's options,
-    // the value of `kernelBarePtrCallConv` will be ignored.
-    SmallVector<Value, 4> arguments = getTypeConverter()->promoteOperands(
-        loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(),
-        rewriter, /*useBarePtrCallConv=*/kernelBarePtrCallConv);
-
-    std::optional<gpu::KernelDim3> clusterSize = std::nullopt;
-    if (launchOp.hasClusterSize()) {
-      clusterSize =
-          gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(),
-                          adaptor.getClusterSizeZ()};
-    }
-    rewriter.create<gpu::LaunchFuncOp>(
-        launchOp.getLoc(), launchOp.getKernelAttr(),
-        gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
-                        adaptor.getGridSizeZ()},
-        gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
-                        adaptor.getBlockSizeZ()},
-        adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize);
-    if (launchOp.getAsyncToken())
-      rewriter.replaceOp(launchOp, {stream});
-    else
-      rewriter.eraseOp(launchOp);
-    return success();
-  }
+  Value stream = Value();
+  if (!adaptor.getAsyncDependencies().empty())
+    stream = adaptor.getAsyncDependencies().front();
+  // If the async keyword is present and there are no dependencies, then a
+  // stream must be created to pass to subsequent operations.
+  else if (launchOp.getAsyncToken())
+    stream = streamCreateCallBuilder.create(loc, rewriter, {}).getResult();
+  // Lower the kernel operands to match kernel parameters.
+  // Note: If `useBarePtrCallConv` is set in the type converter's options,
+  // the value of `kernelBarePtrCallConv` will be ignored.
+  SmallVector<Value, 4> arguments = getTypeConverter()->promoteOperands(
+      loc, launchOp.getKernelOperands(), adaptor.getKernelOperands(), rewriter,
+      /*useBarePtrCallConv=*/kernelBarePtrCallConv);
 
-  auto binaryAttr =
-      kernelModule->getAttrOfType<StringAttr>(gpuBinaryAnnotation);
-  if (!binaryAttr) {
-    kernelModule.emitOpError()
-        << "missing " << gpuBinaryAnnotation << " attribute";
-    return failure();
+  std::optional<gpu::KernelDim3> clusterSize = std::nullopt;
+  if (launchOp.hasClusterSize()) {
+    clusterSize =
+        gpu::KernelDim3{adaptor.getClusterSizeX(), adaptor.getClusterSizeY(),
+                        adaptor.getClusterSizeZ()};
   }
-
-  SmallString<128> nameBuffer(kernelModule.getName());
-  nameBuffer.append(kGpuBinaryStorageSuffix);
-  Value data =
-      LLVM::createGlobalString(loc, rewriter, nameBuffer.str(),
-                               binaryAttr.getValue(), LLVM::Linkage::Internal);
-
-  // Pass the binary size. SPIRV requires binary size.
-  auto gpuBlob = binaryAttr.getValue();
-  auto gpuBlobSize = rewriter.create<mlir::LLVM::ConstantOp>(
-      loc, llvmInt64Type,
-      mlir::IntegerAttr::get(llvmInt64Type,
-                             static_cast<int64_t>(gpuBlob.size())));
-
-  auto module =
-      moduleLoadCallBuilder.create(loc, rewriter, {data, gpuBlobSize});
-
-  // Pass the count of the parameters to runtime wrappers
-  auto paramsCount = rewriter.create<mlir::LLVM::ConstantOp>(
-      loc, llvmInt64Type,
-      mlir::IntegerAttr::get(
-          llvmInt64Type,
-          static_cast<int64_t>(launchOp.getNumKernelOperands())));
-
-  // Get the function from the module. The name corresponds to the name of
-  // the kernel function.
-  auto kernelName = generateKernelNameConstant(
-      launchOp.getKernelModuleName().getValue(),
-      launchOp.getKernelName().getValue(), loc, rewriter);
-  auto function = moduleGetFunctionCallBuilder.create(
-      loc, rewriter, {module.getResult(), kernelName});
-  Value zero = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type, 0);
-  Value stream =
-      adaptor.getAsyncDependencies().empty()
-          ? streamCreateCallBuilder.create(loc, rewriter, {}).getResult()
-          : adaptor.getAsyncDependencies().front();
-  // Create array of pointers to kernel arguments.
-  auto kernelParams = generateParamsArray(launchOp, adaptor, rewriter);
-  auto nullpointer = rewriter.create<LLVM::ZeroOp>(loc, llvmPointerType);
-  Value dynamicSharedMemorySize = launchOp.getDynamicSharedMemorySize()
-                                      ? launchOp.getDynamicSharedMemorySize()
-                                      : zero;
-  launchKernelCallBuilder.create(
-      loc, rewriter,
-      {function.getResult(), adaptor.getGridSizeX(), adaptor.getGridSizeY(),
-       adaptor.getGridSizeZ(), adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
-       adaptor.getBlockSizeZ(), dynamicSharedMemorySize, stream, kernelParams,
-       /*extra=*/nullpointer, paramsCount});
-
-  if (launchOp.getAsyncToken()) {
-    // Async launch: make dependent ops use the same stream.
+  rewriter.create<gpu::LaunchFuncOp>(
+      launchOp.getLoc(), launchOp.getKernelAttr(),
+      gpu::KernelDim3{adaptor.getGridSizeX(), adaptor.getGridSizeY(),
+                      adaptor.getGridSizeZ()},
+      gpu::KernelDim3{adaptor.getBlockSizeX(), adaptor.getBlockSizeY(),
+                      adaptor.getBlockSizeZ()},
+      adaptor.getDynamicSharedMemorySize(), arguments, stream, clusterSize);
+  if (launchOp.getAsyncToken())
     rewriter.replaceOp(launchOp, {stream});
-  } else {
-    // Synchronize with host and destroy stream. This must be the stream created
-    // above (with no other uses) because we check that the synchronous version
-    // does not have any async dependencies.
-    streamSynchronizeCallBuilder.create(loc, rewriter, stream);
-    streamDestroyCallBuilder.create(loc, rewriter, stream);
+  else
     rewriter.eraseOp(launchOp);
-  }
-  moduleUnloadCallBuilder.create(loc, rewriter, module.getResult());
-
   return success();
 }
 
@@ -1978,9 +1723,7 @@ LogicalResult ConvertCreateBsrOpToGpuRuntimeCallPattern::matchAndRewrite(
 
 void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns,
-                                               StringRef gpuBinaryAnnotation,
-                                               bool kernelBarePtrCallConv,
-                                               SymbolTable *cachedModuleTable) {
+                                               bool kernelBarePtrCallConv) {
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
   addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
@@ -2017,7 +1760,5 @@ void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                ConvertSpGEMMCopyOpToGpuRuntimeCallPattern,
                ConvertSpMatGetSizeOpToGpuRuntimeCallPattern,
                ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter);
-  patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
-      converter, gpuBinaryAnnotation, kernelBarePtrCallConv, cachedModuleTable);
-  patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
+  patterns.add<LegalizeLaunchFuncOpPattern>(converter, kernelBarePtrCallConv);
 }
diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 5797c5681a5fdd..2f6647a2a27b15 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -2544,7 +2544,6 @@ std::optional<TypedAttr> mlir::arith::getNeutralElement(Operation *op) {
           .Case([](arith::MulIOp op) { return AtomicRMWKind::muli; })
           .Default([](Operation *op) { return std::nullopt; });
   if (!maybeKind) {
-    op->emitError() << "Unknown neutral element for: " << *op;
     return std::nullopt;
   }
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 0fddd60eb8140e..e422086c9fde6d 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -226,6 +226,7 @@ struct OneShotBufferizePass
       opt.printConflicts = printConflicts;
       opt.testAnalysisOnly = testAnalysisOnly;
       opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
+      opt.checkParallelRegions = checkParallelRegions;
       opt.noAnalysisFuncFilter = noAnalysisFuncFilter;
 
       // Configure type converter.
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
index 2d329a1f3d8892..d0b4e0dd4383ea 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp
@@ -611,7 +611,7 @@ hasReadAfterWriteInterference(const DenseSet<OpOperand *> &usesRead,
   // Before going through the main RaW analysis, find cases where a buffer must
   // be privatized due to parallelism. If the result of a write is never read,
   // privatization is not necessary (and large parts of the IR are likely dead).
-  if (!usesRead.empty()) {
+  if (options.checkParallelRegions && !usesRead.empty()) {
     for (OpOperand *uConflictingWrite : usesWrite) {
       // Find the allocation point or last write (definition) of the buffer.
       // Note: In contrast to `findDefinitions`, this also returns results of
diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp
index 6a362afc52f259..2c43a6f15aa83f 100644
--- a/mlir/lib/IR/AsmPrinter.cpp
+++ b/mlir/lib/IR/AsmPrinter.cpp
@@ -999,9 +999,13 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer,
                                     bool allowTrailingDigit = true) {
   assert(!name.empty() && "Shouldn't have an empty name here");
 
+  auto validChar = [&](char ch) {
+    return llvm::isAlnum(ch) || allowedPunctChars.contains(ch);
+  };
+
   auto copyNameToBuffer = [&] {
     for (char ch : name) {
-      if (llvm::isAlnum(ch) || allowedPunctChars.contains(ch))
+      if (validChar(ch))
         buffer.push_back(ch);
       else if (ch == ' ')
         buffer.push_back('_');
@@ -1013,7 +1017,7 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer,
   // Check to see if this name is valid. If it starts with a digit, then it
   // could conflict with the autogenerated numeric ID's, so add an underscore
   // prefix to avoid problems.
-  if (isdigit(name[0])) {
+  if (isdigit(name[0]) || (!validChar(name[0]) && name[0] != ' ')) {
     buffer.push_back('_');
     copyNameToBuffer();
     return buffer;
@@ -1029,7 +1033,7 @@ static StringRef sanitizeIdentifier(StringRef name, SmallString<16> &buffer,
 
   // Check to see that the name consists of only valid identifier characters.
   for (char ch : name) {
-    if (!llvm::isAlnum(ch) && !allowedPunctChars.contains(ch)) {
+    if (!validChar(ch)) {
       copyNameToBuffer();
       return buffer;
     }
diff --git a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
index 586bf7f8e93fba..1e1b2a8348b1d7 100644
--- a/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
+++ b/mlir/python/mlir/_mlir_libs/_mlir/ir.pyi
@@ -479,7 +479,7 @@ class AffineExpr:
 
 class Attribute:
     @staticmethod
-    def parse(asm: str, context: Optional[Context] = None) -> Attribute:
+    def parse(asm: str | bytes, context: Optional[Context] = None) -> Attribute:
         """
         Parses an attribute from an assembly form. Raises an MLIRError on failure.
         """
@@ -520,7 +520,7 @@ class Attribute:
 
 class Type:
     @staticmethod
-    def parse(asm: str, context: Optional[Context] = None) -> Type:
+    def parse(asm: str | bytes, context: Optional[Context] = None) -> Type:
         """
         Parses the assembly form of a type.
 
@@ -741,7 +741,7 @@ class AffineMap:
     def results(self) -> "AffineMapExprList": ...
 
 class AffineMapAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(affine_map: AffineMap) -> AffineMapAttr:
         """
@@ -779,7 +779,7 @@ class AffineSymbolExpr(AffineExpr):
     def position(self) -> int: ...
 
 class ArrayAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(attributes: List, context: Optional[Context] = None) -> ArrayAttr:
         """
@@ -823,7 +823,7 @@ class AttrBuilder:
         """
 
 class BF16Type(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> BF16Type:
         """
@@ -909,6 +909,11 @@ class BlockArgument(Value):
     def owner(self) -> Block: ...
 
 class BlockArgumentList:
+    @overload
+    def __getitem__(self, arg0: int) -> BlockArgument: ...
+    @overload
+    def __getitem__(self, arg0: slice) -> BlockArgumentList: ...
+    def __len__(self) -> int: ...
     def __add__(self, arg0: BlockArgumentList) -> List[BlockArgument]: ...
     @property
     def types(self) -> List[Type]: ...
@@ -955,7 +960,7 @@ class BoolAttr(Attribute):
         """
 
 class ComplexType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(arg0: Type) -> ComplexType:
         """
@@ -1016,7 +1021,7 @@ class Context:
 class DenseBoolArrayAttr(Attribute):
     @staticmethod
     def get(
-        values: List[bool], context: Optional[Context] = None
+        values: Sequence[bool], context: Optional[Context] = None
     ) -> DenseBoolArrayAttr:
         """
         Gets a uniqued dense array attribute
@@ -1113,7 +1118,7 @@ class DenseElementsAttr(Attribute):
 class DenseF32ArrayAttr(Attribute):
     @staticmethod
     def get(
-        values: List[float], context: Optional[Context] = None
+        values: Sequence[float], context: Optional[Context] = None
     ) -> DenseF32ArrayAttr:
         """
         Gets a uniqued dense array attribute
@@ -1141,7 +1146,7 @@ class DenseF32ArrayIterator:
 class DenseF64ArrayAttr(Attribute):
     @staticmethod
     def get(
-        values: List[float], context: Optional[Context] = None
+        values: Sequence[float], context: Optional[Context] = None
     ) -> DenseF64ArrayAttr:
         """
         Gets a uniqued dense array attribute
@@ -1167,6 +1172,14 @@ class DenseF64ArrayIterator:
     def __next__(self) -> float: ...
 
 class DenseFPElementsAttr(DenseElementsAttr):
+    @staticmethod
+    def get(
+        array: Buffer,
+        signless: bool = True,
+        type: Optional[Type] = None,
+        shape: Optional[List[int]] = None,
+        context: Optional[Context] = None,
+    ) -> DenseFPElementsAttr: ...
     @staticmethod
     def isinstance(other: Attribute) -> bool: ...
     def __getitem__(self, arg0: int) -> float: ...
@@ -1180,7 +1193,7 @@ class DenseFPElementsAttr(DenseElementsAttr):
 
 class DenseI16ArrayAttr(Attribute):
     @staticmethod
-    def get(values: List[int], context: Optional[Context] = None) -> DenseI16ArrayAttr:
+    def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI16ArrayAttr:
         """
         Gets a uniqued dense array attribute
         """
@@ -1206,7 +1219,7 @@ class DenseI16ArrayIterator:
 
 class DenseI32ArrayAttr(Attribute):
     @staticmethod
-    def get(values: List[int], context: Optional[Context] = None) -> DenseI32ArrayAttr:
+    def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI32ArrayAttr:
         """
         Gets a uniqued dense array attribute
         """
@@ -1232,7 +1245,7 @@ class DenseI32ArrayIterator:
 
 class DenseI64ArrayAttr(Attribute):
     @staticmethod
-    def get(values: List[int], context: Optional[Context] = None) -> DenseI64ArrayAttr:
+    def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI64ArrayAttr:
         """
         Gets a uniqued dense array attribute
         """
@@ -1258,7 +1271,7 @@ class DenseI64ArrayIterator:
 
 class DenseI8ArrayAttr(Attribute):
     @staticmethod
-    def get(values: List[int], context: Optional[Context] = None) -> DenseI8ArrayAttr:
+    def get(values: Sequence[int], context: Optional[Context] = None) -> DenseI8ArrayAttr:
         """
         Gets a uniqued dense array attribute
         """
@@ -1283,6 +1296,14 @@ class DenseI8ArrayIterator:
     def __next__(self) -> int: ...
 
 class DenseIntElementsAttr(DenseElementsAttr):
+    @staticmethod
+    def get(
+        array: Buffer,
+        signless: bool = True,
+        type: Optional[Type] = None,
+        shape: Optional[List[int]] = None,
+        context: Optional[Context] = None,
+    ) -> DenseIntElementsAttr: ...
     @staticmethod
     def isinstance(other: Attribute) -> bool: ...
     def __getitem__(self, arg0: int) -> int: ...
@@ -1422,7 +1443,7 @@ class Dialects:
     def __getitem__(self, arg0: str) -> Dialect: ...
 
 class DictAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(value: Dict = {}, context: Optional[Context] = None) -> DictAttr:
         """
@@ -1453,7 +1474,7 @@ class FloatType(Type):
         """
 
 class F16Type(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> F16Type:
         """
@@ -1466,7 +1487,7 @@ class F16Type(FloatType):
     def typeid(self) -> TypeID: ...
 
 class F32Type(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> F32Type:
         """
@@ -1479,7 +1500,7 @@ class F32Type(FloatType):
     def typeid(self) -> TypeID: ...
 
 class F64Type(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> F64Type:
         """
@@ -1513,7 +1534,7 @@ class FlatSymbolRefAttr(Attribute):
         """
 
 class Float8E4M3B11FNUZType(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> Float8E4M3B11FNUZType:
         """
@@ -1526,7 +1547,7 @@ class Float8E4M3B11FNUZType(FloatType):
     def typeid(self) -> TypeID: ...
 
 class Float8E4M3FNType(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> Float8E4M3FNType:
         """
@@ -1539,7 +1560,7 @@ class Float8E4M3FNType(FloatType):
     def typeid(self) -> TypeID: ...
 
 class Float8E4M3FNUZType(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> Float8E4M3FNUZType:
         """
@@ -1552,7 +1573,7 @@ class Float8E4M3FNUZType(FloatType):
     def typeid(self) -> TypeID: ...
 
 class Float8E5M2FNUZType(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> Float8E5M2FNUZType:
         """
@@ -1565,7 +1586,7 @@ class Float8E5M2FNUZType(FloatType):
     def typeid(self) -> TypeID: ...
 
 class Float8E5M2Type(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> Float8E5M2Type:
         """
@@ -1578,7 +1599,7 @@ class Float8E5M2Type(FloatType):
     def typeid(self) -> TypeID: ...
 
 class FloatAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(type: Type, value: float, loc: Optional[Location] = None) -> FloatAttr:
         """
@@ -1612,7 +1633,7 @@ class FloatAttr(Attribute):
         """
 
 class FloatTF32Type(FloatType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> FloatTF32Type:
         """
@@ -1625,7 +1646,7 @@ class FloatTF32Type(FloatType):
     def typeid(self) -> TypeID: ...
 
 class FunctionType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         inputs: List[Type], results: List[Type], context: Optional[Context] = None
@@ -1650,7 +1671,7 @@ class FunctionType(Type):
     def typeid(self) -> TypeID: ...
 
 class IndexType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> IndexType:
         """
@@ -1766,7 +1787,7 @@ class InsertionPoint:
         """
 
 class IntegerAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(type: Type, value: int) -> IntegerAttr:
         """
@@ -1855,7 +1876,7 @@ class IntegerSetConstraintList:
     def __len__(self) -> int: ...
 
 class IntegerType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get_signed(width: int, context: Optional[Context] = None) -> IntegerType:
         """
@@ -1967,7 +1988,7 @@ class Location:
         """
 
 class MemRefType(ShapedType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         shape: List[int],
@@ -2007,7 +2028,7 @@ class Module:
         Creates an empty module
         """
     @staticmethod
-    def parse(asm: str, context: Optional[Context] = None) -> Module:
+    def parse(asm: str | bytes, context: Optional[Context] = None) -> Module:
         """
         Parses a module's assembly format from a string.
 
@@ -2064,7 +2085,7 @@ class NamedAttribute:
         """
 
 class NoneType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> NoneType:
         """
@@ -2130,7 +2151,12 @@ class OpResultList:
 
 class OpSuccessors:
     def __add__(self, arg0: OpSuccessors) -> List[Block]: ...
+    @overload
+    def __getitem__(self, arg0: int) -> Block: ...
+    @overload
+    def __getitem__(self, arg0: slice) -> OpSuccessors: ...
     def __setitem__(self, arg0: int, arg1: Block) -> None: ...
+    def __len__(self) -> int: ...
 
 class OpView(_OperationBase):
     _ODS_OPERAND_SEGMENTS: ClassVar[None] = ...
@@ -2154,7 +2180,7 @@ class OpView(_OperationBase):
     @classmethod
     def parse(
         cls: _Type[_TOperation],
-        source: str,
+        source: str | bytes,
         *,
         source_name: str = "",
         context: Optional[Context] = None,
@@ -2174,7 +2200,7 @@ class OpView(_OperationBase):
         """
 
 class OpaqueAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         dialect_namespace: str,
@@ -2204,7 +2230,7 @@ class OpaqueAttr(Attribute):
     def typeid(self) -> TypeID: ...
 
 class OpaqueType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         dialect_namespace: str, buffer: str, context: Optional[Context] = None
@@ -2262,7 +2288,7 @@ class Operation(_OperationBase):
         """
     @staticmethod
     def parse(
-        source: str, *, source_name: str = "", context: Optional[Context] = None
+        source: str | bytes, *, source_name: str = "", context: Optional[Context] = None
     ) -> Operation:
         """
         Parses an operation. Supports both text assembly format and binary bytecode format.
@@ -2290,7 +2316,7 @@ class OperationList:
     def __len__(self) -> int: ...
 
 class RankedTensorType(ShapedType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         shape: List[int],
@@ -2443,7 +2469,7 @@ class ShapedTypeComponents:
         """
 
 class StridedLayoutAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         offset: int, strides: List[int], context: Optional[Context] = None
@@ -2477,9 +2503,9 @@ class StridedLayoutAttr(Attribute):
     def typeid(self) -> TypeID: ...
 
 class StringAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
-    def get(value: str, context: Optional[Context] = None) -> StringAttr:
+    def get(value: str | bytes, context: Optional[Context] = None) -> StringAttr:
         """
         Gets a uniqued string attribute
         """
@@ -2554,9 +2580,9 @@ class SymbolTable:
     def insert(self, operation: _OperationBase) -> Attribute: ...
 
 class TupleType(Type):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
-    def get_Tuple(elements: List[Type], context: Optional[Context] = None) -> TupleType:
+    def get_tuple(elements: List[Type], context: Optional[Context] = None) -> TupleType:
         """
         Create a Tuple type
         """
@@ -2576,7 +2602,7 @@ class TupleType(Type):
     def typeid(self) -> TypeID: ...
 
 class TypeAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(value: Type, context: Optional[Context] = None) -> TypeAttr:
         """
@@ -2603,7 +2629,7 @@ class TypeID:
     def _CAPIPtr(self) -> object: ...
 
 class UnitAttr(Attribute):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(context: Optional[Context] = None) -> UnitAttr:
         """
@@ -2618,7 +2644,7 @@ class UnitAttr(Attribute):
     def typeid(self) -> TypeID: ...
 
 class UnrankedMemRefType(ShapedType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         element_type: Type, memory_space: Attribute, loc: Optional[Location] = None
@@ -2638,7 +2664,7 @@ class UnrankedMemRefType(ShapedType):
     def typeid(self) -> TypeID: ...
 
 class UnrankedTensorType(ShapedType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(element_type: Type, loc: Optional[Location] = None) -> UnrankedTensorType:
         """
@@ -2651,7 +2677,7 @@ class UnrankedTensorType(ShapedType):
     def typeid(self) -> TypeID: ...
 
 class VectorType(ShapedType):
-    static_typeid: ClassVar[TypeID]  # value = <mlir._mlir_libs._TypeID object>
+    static_typeid: ClassVar[TypeID]
     @staticmethod
     def get(
         shape: List[int],
diff --git a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
index 6dafe29e2e5f69..d7767bda08435f 100644
--- a/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
+++ b/mlir/test/Conversion/ComplexToStandard/convert-to-standard.mlir
@@ -1,5 +1,5 @@
 // RUN: mlir-opt %s --convert-complex-to-standard --split-input-file |\
-// RUN: FileCheck %s --dump-input=always
+// RUN: FileCheck %s
 
 // CHECK-LABEL: func @complex_abs
 // CHECK-SAME: %[[ARG:.*]]: complex<f32>
@@ -703,14 +703,14 @@ func.func @complex_sqrt_nnan_ninf(%arg: complex<f32>) -> complex<f32> {
 // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath<nnan,ninf> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,ninf> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,ninf> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,ninf> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,ninf> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,ninf> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] : f32
 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32
-// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath<nnan,ninf> : f32
-// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath<nnan,ninf> : f32
-// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath<nnan,ninf> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath<nnan,ninf> : f32
+// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] : f32
+// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] : f32
+// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] : f32
 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32
 // CHECK: %[[ARGARG:.*]] = math.atan2 %[[IM]], %[[RE]] fastmath<nnan,ninf> : f32
 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath<nnan,ninf> : f32
@@ -819,12 +819,12 @@ func.func @complex_abs_with_fmf(%arg: complex<f32>) -> f32 {
 // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
+// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<contract> : f32
+// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32
 // CHECK: return %[[ABS]] : f32
 
@@ -918,12 +918,12 @@ func.func @complex_log_with_fmf(%arg: complex<f32>) -> complex<f32> {
 // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
+// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<contract> : f32
+// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32
 // CHECK: %[[RESULT_REAL:.*]] = math.log %[[ABS]] fastmath<nnan,contract> : f32
 // CHECK: %[[REAL2:.*]] = complex.re %[[ARG]] : complex<f32>
@@ -952,14 +952,14 @@ func.func @complex_log1p_with_fmf(%arg: complex<f32>) -> complex<f32> {
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL_PLUS_ONE]], %[[ABS_IMAG]] fastmath<nnan,contract>  : f32
 // CHECK: %[[CMPF:.*]] = arith.cmpf ogt, %[[REAL_PLUS_ONE]], %[[ABS_IMAG]] fastmath<nnan,contract>  : f32
 // CHECK: %[[MAX_MINUS_ONE:.*]] = arith.subf %[[MAX]], %[[ONE]] fastmath<nnan,contract>  : f32
-// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %0, %[[MAX_MINUS_ONE]] : f32
-// CHECK: %[[MIN_MAX_RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract>  : f32
+// CHECK: %[[SELECT:.*]] = arith.select %[[CMPF]], %[[REAL]], %[[MAX_MINUS_ONE]] : f32
+// CHECK: %[[MIN_MAX_RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
 // CHECK: %[[LOG_1:.*]] = math.log1p %[[SELECT]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[MIN_MAX_RATIO]], %[[MIN_MAX_RATIO]] fastmath<nnan,contract>  : f32
-// CHECK: %[[LOG_SQ:.*]] = math.log1p %[[RATIO_SQ]] fastmath<nnan,contract>  : f32
-// CHECK: %[[HALF_LOG_SQ:.*]] = arith.mulf %cst, %[[LOG_SQ]] fastmath<nnan,contract>  : f32
-// CHECK: %[[R:.*]] = arith.addf %[[HALF_LOG_SQ]], %[[LOG_1]] fastmath<nnan,contract>  : f32
-// CHECK: %[[ISNAN:.*]] = arith.cmpf uno, %[[R]], %[[R]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[MIN_MAX_RATIO]], %[[MIN_MAX_RATIO]] fastmath<contract> : f32
+// CHECK: %[[LOG_SQ:.*]] = math.log1p %[[RATIO_SQ]] fastmath<contract> : f32
+// CHECK: %[[HALF_LOG_SQ:.*]] = arith.mulf %cst, %[[LOG_SQ]] fastmath<contract> : f32
+// CHECK: %[[R:.*]] = arith.addf %[[HALF_LOG_SQ]], %[[LOG_1]] fastmath<contract> : f32
+// CHECK: %[[ISNAN:.*]] = arith.cmpf uno, %[[R]], %[[R]] fastmath<contract> : f32
 // CHECK: %[[RESULT_REAL:.*]] = arith.select %[[ISNAN]], %[[MIN]], %[[R]] : f32
 // CHECK: %[[RESULT_IMAG:.*]] = math.atan2 %[[IMAG]], %[[REAL_PLUS_ONE]] fastmath<nnan,contract> : f32
 // CHECK: %[[RESULT:.*]] = complex.create %[[RESULT_REAL]], %[[RESULT_IMAG]] : complex<f32>
@@ -1298,14 +1298,14 @@ func.func @complex_atan2_with_fmf(%lhs: complex<f32>,
 // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32
-// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath<contract> : f32
+// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32
 // CHECK: %[[ARGARG:.*]] = math.atan2 %[[IM]], %[[RE]] fastmath<nnan,contract> : f32
 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath<nnan,contract> : f32
@@ -1539,12 +1539,12 @@ func.func @complex_atan2_with_fmf(%lhs: complex<f32>,
 // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
+// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<contract> : f32
+// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32
 // CHECK: %[[VAR436:.*]] = math.log %[[ABS]] fastmath<nnan,contract> : f32
 // CHECK: %[[VAR437:.*]] = complex.re %[[VAR415]] : complex<f32>
@@ -1778,14 +1778,14 @@ func.func @complex_sqrt_with_fmf(%arg: complex<f32>) -> complex<f32> {
 // CHECK: %[[ABSIM:.*]] = math.absf %[[IM]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABSRE]], %[[ABSIM]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
 // CHECK: %[[QUARTER:.*]] = arith.constant 2.500000e-01 : f32
-// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[SQRT_MAX:.*]] = math.sqrt %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[POW:.*]] = math.powf %[[RATIO_SQ_PLUS_ONE]], %[[QUARTER]] fastmath<contract> : f32
+// CHECK: %[[SQRT_ABS_OR_NAN:.*]] = arith.mulf %[[SQRT_MAX]], %[[POW]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[SQRT_ABS_OR_NAN]], %[[SQRT_ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[SQRT_ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[SQRT_ABS_OR_NAN]] : f32
 // CHECK: %[[ARGARG:.*]] = math.atan2 %[[IM]], %[[RE]] fastmath<nnan,contract> : f32
 // CHECK: %[[SQRTARG:.*]] = arith.mulf %[[ARGARG]], %[[HALF]] fastmath<nnan,contract> : f32
@@ -1886,12 +1886,12 @@ func.func @complex_sign_with_fmf(%arg: complex<f32>) -> complex<f32> {
 // CHECK: %[[ABS_IMAG:.*]] = math.absf %[[IMAG2]] fastmath<nnan,contract> : f32
 // CHECK: %[[MAX:.*]] = arith.maximumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
 // CHECK: %[[MIN:.*]] = arith.minimumf %[[ABS_REAL]], %[[ABS_IMAG]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<nnan,contract> : f32
-// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<nnan,contract> : f32
-// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<nnan,contract> : f32
-// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<nnan,contract> : f32
+// CHECK: %[[RATIO:.*]] = arith.divf %[[MIN]], %[[MAX]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ:.*]] = arith.mulf %[[RATIO]], %[[RATIO]] fastmath<contract> : f32
+// CHECK: %[[RATIO_SQ_PLUS_ONE:.*]] = arith.addf %[[RATIO_SQ]], %[[ONE]] fastmath<contract> : f32
+// CHECK: %[[SQRT:.*]] = math.sqrt %[[RATIO_SQ_PLUS_ONE]] fastmath<contract> : f32
+// CHECK: %[[ABS_OR_NAN:.*]] = arith.mulf %[[MAX]], %[[SQRT]] fastmath<contract> : f32
+// CHECK: %[[IS_NAN:.*]] = arith.cmpf uno, %[[ABS_OR_NAN]], %[[ABS_OR_NAN]] fastmath<contract> : f32
 // CHECK: %[[ABS:.*]] = arith.select %[[IS_NAN]], %[[MIN]], %[[ABS_OR_NAN]] : f32
 // CHECK: %[[REAL_SIGN:.*]] = arith.divf %[[REAL]], %[[ABS]] fastmath<nnan,contract> : f32
 // CHECK: %[[IMAG_SIGN:.*]] = arith.divf %[[IMAG]], %[[ABS]] fastmath<nnan,contract> : f32
diff --git a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
index c0b05ef0860333..6c5c1e09c0eb5f 100644
--- a/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-launch-func-to-gpu-runtime-calls.mlir
@@ -1,15 +1,8 @@
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=nvvm.cubin" -split-input-file | FileCheck %s
-// RUN: mlir-opt %s --gpu-to-llvm="gpu-binary-annotation=rocdl.hsaco" -split-input-file | FileCheck %s --check-prefix=ROCDL
+// RUN: mlir-opt %s --gpu-to-llvm -split-input-file | FileCheck %s
 
 module attributes {gpu.container_module} {
-
-  // CHECK: llvm.mlir.global internal constant @[[KERNEL_NAME:.*]]("kernel\00")
-  // CHECK: llvm.mlir.global internal constant @[[GLOBAL:.*]]("CUBIN")
-  // ROCDL: llvm.mlir.global internal constant @[[GLOBAL:.*]]("HSACO")
-
-  gpu.module @kernel_module attributes {
-      nvvm.cubin = "CUBIN", rocdl.hsaco = "HSACO"
-  } {
+  // CHECK: gpu.module
+  gpu.module @kernel_module [#nvvm.target] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr,
         %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,
         %arg5: i64) attributes {gpu.kernel} {
@@ -18,9 +11,17 @@ module attributes {gpu.container_module} {
   }
 
   func.func @foo(%buffer: memref<?xf32>) {
+  // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
+  // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
+  // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
     %c8 = arith.constant 8 : index
     %c32 = arith.constant 32 : i32
     %c256 = arith.constant 256 : i32
+
+  // CHECK: gpu.launch_func @kernel_module::@kernel
+  // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64
+  // CHECK: dynamic_shared_memory_size [[C256]]
+  // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64)
     gpu.launch_func @kernel_module::@kernel
         blocks in (%c8, %c8, %c8)
         threads in (%c8, %c8, %c8)
@@ -28,46 +29,13 @@ module attributes {gpu.container_module} {
         args(%c32 : i32, %buffer : memref<?xf32>)
     return
   }
-
-  // CHECK-DAG: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
-  // CHECK-DAG: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
-  // CHECK: [[ADDRESSOF:%.*]] = llvm.mlir.addressof @[[GLOBAL]]
-  // CHECK: [[BINARY:%.*]] = llvm.getelementptr [[ADDRESSOF]]{{\[}}0, 0]
-  // CHECK-SAME: -> !llvm.ptr
-  // CHECK: [[BINARYSIZE:%.*]] = llvm.mlir.constant
-  // CHECK: [[MODULE:%.*]] = llvm.call @mgpuModuleLoad([[BINARY]], [[BINARYSIZE]])
-  // CHECK: [[PARAMSCOUNT:%.*]] = llvm.mlir.constant
-  // CHECK: [[FUNC:%.*]] = llvm.call @mgpuModuleGetFunction([[MODULE]], {{.*}})
-
-  // CHECK: [[STREAM:%.*]] = llvm.call @mgpuStreamCreate
-
-  // CHECK: %[[ONE:.*]] = llvm.mlir.constant(1 : i32)
-  // CHECK: %[[MEMREF:.*]] = llvm.alloca %[[ONE]] x !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: [[NUM_PARAMS:%.*]] = llvm.mlir.constant(6 : i32) : i32
-  // CHECK-NEXT: [[PARAMS:%.*]] = llvm.alloca [[NUM_PARAMS]] x !llvm.ptr
-
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 1] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 2] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 3] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 4] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-  // CHECK: llvm.getelementptr %[[MEMREF]][0, 5] : (!llvm.ptr) -> !llvm.ptr, !llvm.struct[[STRUCT_BODY:<.*>]]
-
-  // CHECK: [[EXTRA_PARAMS:%.*]] = llvm.mlir.zero : !llvm.ptr
-
-  // CHECK: llvm.call @mgpuLaunchKernel([[FUNC]], [[C8]], [[C8]], [[C8]],
-  // CHECK-SAME: [[C8]], [[C8]], [[C8]], [[C256]], [[STREAM]],
-  // CHECK-SAME: [[PARAMS]], [[EXTRA_PARAMS]], [[PARAMSCOUNT]])
-  // CHECK: llvm.call @mgpuStreamSynchronize
-  // CHECK: llvm.call @mgpuStreamDestroy
-  // CHECK: llvm.call @mgpuModuleUnload
 }
 
+
 // -----
 
 module attributes {gpu.container_module} {
   // CHECK: gpu.module
-  // ROCDL: gpu.module
   gpu.module @kernel_module [#nvvm.target] {
     llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr,
         %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,
@@ -80,15 +48,19 @@ module attributes {gpu.container_module} {
   // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
   // CHECK: [[C32:%.*]] = llvm.mlir.constant(32 : i32) : i32
   // CHECK: [[C256:%.*]] = llvm.mlir.constant(256 : i32) : i32
-    %c8 = arith.constant 8 : index
+  // CHECK: [[C2:%.*]] = llvm.mlir.constant(2 : index) : i64
+    %c8 = arith.constant 8 : index    
     %c32 = arith.constant 32 : i32
     %c256 = arith.constant 256 : i32
+    %c2 = arith.constant 2 : index
 
   // CHECK: gpu.launch_func @kernel_module::@kernel
+  // CHECK: clusters in ([[C2]], [[C2]], [[C2]])
   // CHECK: blocks in ([[C8]], [[C8]], [[C8]]) threads in ([[C8]], [[C8]], [[C8]]) : i64
   // CHECK: dynamic_shared_memory_size [[C256]]
   // CHECK: args([[C32]] : i32, %{{.*}} : !llvm.ptr, %{{.*}} : !llvm.ptr, %{{.*}} : i64, %{{.*}} : i64, %{{.*}} : i64)
     gpu.launch_func @kernel_module::@kernel
+        clusters in (%c2, %c2, %c2)
         blocks in (%c8, %c8, %c8)
         threads in (%c8, %c8, %c8)
         dynamic_shared_memory_size %c256
@@ -97,18 +69,11 @@ module attributes {gpu.container_module} {
   }
 }
 
-
 // -----
 
 module attributes {gpu.container_module} {
-  // CHECK: gpu.module
-  gpu.module @kernel_module [#nvvm.target] {
-    llvm.func @kernel(%arg0: i32, %arg1: !llvm.ptr,
-        %arg2: !llvm.ptr, %arg3: i64, %arg4: i64,
-        %arg5: i64) attributes {gpu.kernel} {
-      llvm.return
-    }
-  }
+  // CHECK: gpu.binary
+  gpu.binary @kernel_module [#gpu.object<#rocdl.target, "blob">]
 
   func.func @foo(%buffer: memref<?xf32>) {
   // CHECK: [[C8:%.*]] = llvm.mlir.constant(8 : index) : i64
diff --git a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
index 2f229da44f9909..eb359b5ffd50a9 100644
--- a/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
+++ b/mlir/test/Dialect/Linalg/transform-tile-reduction.mlir
@@ -357,7 +357,6 @@ module {
     %0 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "reduction"]} ins(%arg0 : tensor<?x?xf32>) outs(%arg1 : tensor<?xf32>) {
     ^bb0(%in: f32, %out: f32):
       %1 = llvm.fmul %in, %in  : f32
-      // expected-error @below {{Unknown neutral element for:}}
       %2 = llvm.fadd %1, %out  : f32
       linalg.yield %2 : f32
     } -> tensor<?xf32>
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
index 4d82021e86f5bb..9bb87ffbb20905 100644
--- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir
@@ -1,4 +1,5 @@
-// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s --check-prefixes=CHECK,PARALLEL-CHECK
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only check-parallel-regions=false" -split-input-file | FileCheck %s --check-prefixes=CHECK,NO-PARALLEL-CHECK
 
 // Run fuzzer with different seeds.
 // RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs-from-loops bufferize-function-boundaries test-analysis-only analysis-heuristic=fuzzer analysis-fuzzer-seed=23" -split-input-file -o /dev/null
@@ -811,8 +812,10 @@ func.func @parallel_region() -> tensor<320xf32>
   %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
     %val = "test.foo"() : () -> (f32)
     // linalg.fill must bufferize out-of-place because every thread needs a
-    // private copy of %alloc1.
-    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // private copy of %alloc1. If not accounting for parallel regions, the fill
+    // can bufferize in place.
+    // PARALLEL-CHECK:    linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
     %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
     scf.forall.in_parallel {
       // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
@@ -841,8 +844,10 @@ func.func @parallel_region_mixed_def(%c: i1) -> tensor<320xf32>
     }
     %val = "test.foo"() : () -> (f32)
     // linalg.fill must bufferize out-of-place because every thread needs a
-    // private copy of %alloc1.
-    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // private copy of %alloc1. If not accounting for parallel regions, the fill
+    // can bufferize in place.
+    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
     %fill = linalg.fill ins(%val : f32) outs(%selected : tensor<1xf32>) -> tensor<1xf32>
     scf.forall.in_parallel {
       // CHECK: tensor.parallel_insert_slice {{.*}} {__inplace_operands_attr__ = ["true", "true", "none"]}
@@ -866,8 +871,10 @@ func.func @parallel_region_two_writes(%f: f32) -> tensor<320xf32>
   %0 = scf.forall (%arg0) in (%c320) shared_outs(%arg1 = %alloc0) -> (tensor<320xf32>) {
     %val = "test.foo"() : () -> (f32)
     // linalg.fill must bufferize out-of-place because every thread needs a
-    // private copy of %alloc1.
-    // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // private copy of %alloc1. If not accounting for parallel regions, the fill
+    // can bufferize in place.
+    // PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "false"]}
+    // NO-PARALLEL-CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]}
     %fill = linalg.fill ins(%val : f32) outs(%alloc1 : tensor<1xf32>) -> tensor<1xf32>
     // CHECK: tensor.insert
     // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"]
diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
index b4cb640108baee..9b23681dba6a8a 100644
--- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
+++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir
@@ -1,12 +1,17 @@
 // RUN: mlir-opt %s -test-vector-transfer-collapse-inner-most-dims -split-input-file | FileCheck %s
 
-func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
+//-----------------------------------------------------------------------------
+// 1. vector.transfer_read
+//-----------------------------------------------------------------------------
+
+func.func @contiguous_inner_most(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
   %c0 = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
   %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32>
   return %0 : vector<1x8x1xf32>
 }
-//      CHECK: func @contiguous_inner_most_view(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
+
+//      CHECK: func @contiguous_inner_most(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
 //      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
 // CHECK-SAME:    memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>
 //      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[SRC_0]]
@@ -14,15 +19,61 @@ func.func @contiguous_inner_most_view(%in: memref<1x1x8x1xf32, strided<[3072, 8,
 //      CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[VEC]]
 //      CHECK:   return %[[RESULT]]
 
+// Same as the top example within this split, but with the inner vector
+// dim scalable. Note that this example only makes sense when "8 = [8]" (i.e.
+// vscale = 1). This is assumed (implicitly) via the `in_bounds` attribute.
+
+func.func @contiguous_inner_most_scalable_inner_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x[8]x1xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x[8]x1xf32>
+  return %0 : vector<1x[8]x1xf32>
+}
+
+//      CHECK: func @contiguous_inner_most_scalable_inner_dim(%[[SRC:.+]]: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>
+//      CHECK:   %[[SRC_0:.+]] = memref.subview %[[SRC]]
+// CHECK-SAME:    memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>> to memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>
+//      CHECK:   %[[VEC:.+]] = vector.transfer_read %[[SRC_0]]
+// CHECK-SAME:    memref<1x1x8xf32, strided<[3072, 8, 1], offset: ?>>, vector<1x[8]xf32>
+//      CHECK:   %[[RESULT:.+]] = vector.shape_cast %[[VEC]]
+//      CHECK:   return %[[RESULT]]
+
+// Same as the top example within this split, but the trailing unit dim was
+// replaced with a dyn dim - not supported
+
+func.func @non_unit_trailing_dim(%in: memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x1xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x?xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x1xf32>
+  return %0 : vector<1x8x1xf32>
+}
+
+//  CHECK-LABEL: func @non_unit_trailing_dim
+//    CHECK-NOT: memref.subview
+//    CHECK-NOT: vector.shape_cast
+
+// Same as the top example within this split, but with a scalable unit dim in
+// the output vector - not supported
+
+func.func @negative_scalable_unit_dim(%in: memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>) -> vector<1x8x[1]xf32>{
+  %c0 = arith.constant 0 : index
+  %cst = arith.constant 0.0 : f32
+  %0 = vector.transfer_read %in[%c0, %c0, %c0, %c0], %cst {in_bounds = [true, true, true]} : memref<1x1x8x1xf32, strided<[3072, 8, 1, 1], offset: ?>>, vector<1x8x[1]xf32>
+  return %0 : vector<1x8x[1]xf32>
+}
+//  CHECK-LABEL: func @negative_scalable_unit_dim
+//    CHECK-NOT: memref.subview
+//    CHECK-NOT: vector.shape_cast
+
 // -----
 
-func.func @contiguous_outer_dyn_inner_most_view(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
+func.func @contiguous_outer_dyn_inner_most(%a: index, %b: index, %memref: memref<?x?x8x1xf32>) -> vector<8x1xf32> {
   %c0 = arith.constant 0 : index
   %pad = arith.constant 0.0 : f32
   %v = vector.transfer_read %memref[%a, %b, %c0, %c0], %pad {in_bounds = [true, true]} : memref<?x?x8x1xf32>, vector<8x1xf32>
   return %v : vector<8x1xf32>
 }
-// CHECK: func.func @contiguous_outer_dyn_inner_most_view(
+// CHECK: func.func @contiguous_outer_dyn_inner_most(
 // CHECK-SAME:   %[[IDX0:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[IDX1:[a-zA-Z0-9]+]]
 // CHECK-SAME:   %[[SRC:[a-zA-Z0-9]+]]
@@ -103,6 +154,10 @@ func.func @contiguous_inner_most_dim_out_of_bounds_2d(%arg0: memref<1x1xf32>) ->
 
 // -----
 
+//-----------------------------------------------------------------------------
+// 2. vector.transfer_write
+//-----------------------------------------------------------------------------
+
 func.func @drop_two_inner_most_dim_for_transfer_write(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) {
   %c0 = arith.constant 0 : index
   vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0]
@@ -177,21 +232,6 @@ func.func @non_unit_strides(%arg0: memref<512x16x1xf32, strided<[8192, 16, 4], o
 
 // -----
 
-func.func @leading_scalable_dimension_transfer_read(%dest : memref<24x1xf32>) -> vector<[4]x1xf32> {
-  %c0 = arith.constant 0 : index
-  %pad = arith.constant 0.0 : f32
-  %0 = vector.transfer_read %dest[%c0, %c0], %pad {in_bounds = [true, true]} : memref<24x1xf32>, vector<[4]x1xf32>
-  return %0 : vector<[4]x1xf32>
-}
-// CHECK:      func.func @leading_scalable_dimension_transfer_read
-// CHECK-SAME:   %[[DEST:[a-zA-Z0-9]+]]
-// CHECK:        %[[SUBVIEW:.+]] = memref.subview %[[DEST]][0, 0] [24, 1] [1, 1] : memref<24x1xf32> to memref<24xf32, strided<[1]>>
-// CHECK:        %[[READ:.+]] = vector.transfer_read %[[SUBVIEW]]{{.*}} {in_bounds = [true]} : memref<24xf32, strided<[1]>>, vector<[4]xf32>
-// CHECK:        %[[CAST:.+]] = vector.shape_cast %[[READ]] : vector<[4]xf32> to vector<[4]x1xf32>
-// CHECK:        return %[[CAST]]
-
-// -----
-
 // Negative test: [1] (scalable 1) is _not_ a unit dimension.
 func.func @trailing_scalable_one_dim_transfer_read(%dest : memref<24x1xf32>) -> vector<4x[1]xf32> {
   %c0 = arith.constant 0 : index
diff --git a/mlir/test/IR/print-attr-type-aliases.mlir b/mlir/test/IR/print-attr-type-aliases.mlir
index 162eacd0022832..27c5a75addbb59 100644
--- a/mlir/test/IR/print-attr-type-aliases.mlir
+++ b/mlir/test/IR/print-attr-type-aliases.mlir
@@ -11,6 +11,9 @@
 // CHECK-DAG: #_0_test_alias = "alias_test:prefixed_digit"
 "test.op"() {alias_test = "alias_test:prefixed_digit"} : () -> ()
 
+// CHECK-DAG: #_25test = "alias_test:prefixed_symbol"
+"test.op"() {alias_test = "alias_test:prefixed_symbol"} : () -> ()
+
 // CHECK-DAG: #test_alias_conflict0_ = "alias_test:sanitize_conflict_a"
 // CHECK-DAG: #test_alias_conflict0_1 = "alias_test:sanitize_conflict_b"
 "test.op"() {alias_test = ["alias_test:sanitize_conflict_a", "alias_test:sanitize_conflict_b"]} : () -> ()
diff --git a/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp b/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp
index a3a8913d5964c6..64add8cef36986 100644
--- a/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp
+++ b/mlir/test/lib/Dialect/Test/TestDialectInterfaces.cpp
@@ -188,6 +188,7 @@ struct TestOpAsmInterface : public OpAsmDialectInterface {
             .Case("alias_test:dot_in_name", StringRef("test.alias"))
             .Case("alias_test:trailing_digit", StringRef("test_alias0"))
             .Case("alias_test:prefixed_digit", StringRef("0_test_alias"))
+            .Case("alias_test:prefixed_symbol", StringRef("%test"))
             .Case("alias_test:sanitize_conflict_a",
                   StringRef("test_alias_conflict0"))
             .Case("alias_test:sanitize_conflict_b",
diff --git a/offload/include/Shared/APITypes.h b/offload/include/Shared/APITypes.h
index e8fc27785b6c2e..a84b685eeedcec 100644
--- a/offload/include/Shared/APITypes.h
+++ b/offload/include/Shared/APITypes.h
@@ -89,22 +89,26 @@ struct __tgt_async_info {
 
 /// This struct contains all of the arguments to a target kernel region launch.
 struct KernelArgsTy {
-  uint32_t Version;   // Version of this struct for ABI compatibility.
-  uint32_t NumArgs;   // Number of arguments in each input pointer.
-  void **ArgBasePtrs; // Base pointer of each argument (e.g. a struct).
-  void **ArgPtrs;     // Pointer to the argument data.
-  int64_t *ArgSizes;  // Size of the argument data in bytes.
-  int64_t *ArgTypes;  // Type of the data (e.g. to / from).
-  void **ArgNames;    // Name of the data for debugging, possibly null.
-  void **ArgMappers;  // User-defined mappers, possibly null.
-  uint64_t Tripcount; // Tripcount for the teams / distribute loop, 0 otherwise.
+  uint32_t Version = 0; // Version of this struct for ABI compatibility.
+  uint32_t NumArgs = 0; // Number of arguments in each input pointer.
+  void **ArgBasePtrs =
+      nullptr;                 // Base pointer of each argument (e.g. a struct).
+  void **ArgPtrs = nullptr;    // Pointer to the argument data.
+  int64_t *ArgSizes = nullptr; // Size of the argument data in bytes.
+  int64_t *ArgTypes = nullptr; // Type of the data (e.g. to / from).
+  void **ArgNames = nullptr;   // Name of the data for debugging, possibly null.
+  void **ArgMappers = nullptr; // User-defined mappers, possibly null.
+  uint64_t Tripcount =
+      0; // Tripcount for the teams / distribute loop, 0 otherwise.
   struct {
     uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
     uint64_t Unused : 63;
-  } Flags;
-  uint32_t NumTeams[3];    // The number of teams (for x,y,z dimension).
-  uint32_t ThreadLimit[3]; // The number of threads (for x,y,z dimension).
-  uint32_t DynCGroupMem;   // Amount of dynamic cgroup memory requested.
+  } Flags = {0, 0};
+  // The number of teams (for x,y,z dimension).
+  uint32_t NumTeams[3] = {0, 0, 0};
+   // The number of threads (for x,y,z dimension).
+  uint32_t ThreadLimit[3] = {0, 0, 0};
+  uint32_t DynCGroupMem = 0;     // Amount of dynamic cgroup memory requested.
 };
 static_assert(sizeof(KernelArgsTy().Flags) == sizeof(uint64_t),
               "Invalid struct size");
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
index fac692addb9e5a..9f8e217bbd08be 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/BUILD.bazel
@@ -378,6 +378,7 @@ libc_support_library(
     name = "rint_test_template",
     hdrs = ["RIntTest.h"],
     deps = [
+        "//libc:__support_cpp_algorithm",
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
         "//libc:hdr_fenv_macros",
@@ -416,6 +417,7 @@ libc_support_library(
     name = "round_to_integer_test_template",
     hdrs = ["RoundToIntegerTest.h"],
     deps = [
+        "//libc:__support_cpp_algorithm",
         "//libc:__support_fputil_fenv_impl",
         "//libc:__support_fputil_fp_bits",
         "//libc:hdr_math_macros",
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
index fdf89d00cbb1d1..448d2ed9201acf 100644
--- a/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/test/BUILD.bazel
@@ -438,6 +438,7 @@ cc_library(
     srcs = ["lib/Dialect/Test/TestToLLVMIRTranslation.cpp"],
     deps = [
         ":TestDialect",
+        "//llvm:Core",
         "//llvm:Support",
         "//mlir:BuiltinToLLVMIRTranslation",
         "//mlir:IR",