From 486644723038555a224fd09d462bb5099e64809e Mon Sep 17 00:00:00 2001
From: Vladislav Belov <vladislav.belov@syntacore.com>
Date: Tue, 26 Nov 2024 14:56:46 +0300
Subject: [PATCH 01/41] [Clang] Fix name lookup for dependent bases (#114978)

Currently the following example is a compilation failure:
```cpp
template<typename T> struct A {
    typedef int M;
    struct B {
      typedef void M;
      struct C;
    };
};

template<typename T> struct A<T>::B::C : A<T> {
    M m; // void or int ?
};
```

According to the point 13.8.3.2

```
A dependent base class is a base class that is a dependent type and is not the current instantiation.
Note 2 : A base class can be the current instantiation in the case of a nested class naming an enclosing class as a base.
```

The base class `A` is the current instantiation, because `C` is a nested
class for an enclosing class `A<T>`, it's is the not-dependent base
class and we need to search the names through its scope.

This patch makes this example compile
---
 clang/docs/ReleaseNotes.rst      |  3 ++
 clang/lib/AST/CXXInheritance.cpp | 18 ++++++++----
 clang/test/CXX/drs/cwg5xx.cpp    | 48 ++++++++++++++++++++++++++++++--
 clang/www/cxx_dr_status.html     |  2 +-
 4 files changed, 62 insertions(+), 9 deletions(-)
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 954fe61f3d1d69..ff24161de493c7 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -279,6 +279,9 @@ Resolutions to C++ Defect Reports
   by default.
   (`CWG2521: User-defined literals and reserved identifiers <https://cplusplus.github.io/CWG/issues/2521.html>`_).
 
+- Fix name lookup for a dependent base class that is the current instantiation.  
+  (`CWG591: When a dependent base class is the current instantiation <https://cplusplus.github.io/CWG/issues/591.html>`_).
+
 C Language Changes
 ------------------
 
diff --git a/clang/lib/AST/CXXInheritance.cpp b/clang/lib/AST/CXXInheritance.cpp
index aefc06e9197cfb..10b8d524ff8978 100644
--- a/clang/lib/AST/CXXInheritance.cpp
+++ b/clang/lib/AST/CXXInheritance.cpp
@@ -134,7 +134,7 @@ bool CXXRecordDecl::forallBases(ForallBasesCallback BaseMatches) const {
         return false;
 
       CXXRecordDecl *Base =
-            cast_or_null<CXXRecordDecl>(Ty->getDecl()->getDefinition());
+          cast_if_present<CXXRecordDecl>(Ty->getDecl()->getDefinition());
       if (!Base ||
           (Base->isDependentContext() &&
            !Base->isCurrentInstantiation(Record))) {
@@ -169,13 +169,21 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
     QualType BaseType =
         Context.getCanonicalType(BaseSpec.getType()).getUnqualifiedType();
 
+    bool isCurrentInstantiation = isa<InjectedClassNameType>(BaseType);
+    if (!isCurrentInstantiation) {
+      if (auto *BaseRecord = cast_if_present<CXXRecordDecl>(
+              BaseSpec.getType()->getAsRecordDecl()))
+        isCurrentInstantiation = BaseRecord->isDependentContext() &&
+                                 BaseRecord->isCurrentInstantiation(Record);
+    }
     // C++ [temp.dep]p3:
     //   In the definition of a class template or a member of a class template,
     //   if a base class of the class template depends on a template-parameter,
     //   the base class scope is not examined during unqualified name lookup
     //   either at the point of definition of the class template or member or
     //   during an instantiation of the class tem- plate or member.
-    if (!LookupInDependent && BaseType->isDependentType())
+    if (!LookupInDependent &&
+        (BaseType->isDependentType() && !isCurrentInstantiation))
       continue;
 
     // Determine whether we need to visit this base class at all,
@@ -243,9 +251,8 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
         return FoundPath;
       }
     } else if (VisitBase) {
-      CXXRecordDecl *BaseRecord;
+      CXXRecordDecl *BaseRecord = nullptr;
       if (LookupInDependent) {
-        BaseRecord = nullptr;
         const TemplateSpecializationType *TST =
             BaseSpec.getType()->getAs<TemplateSpecializationType>();
         if (!TST) {
@@ -264,8 +271,7 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
             BaseRecord = nullptr;
         }
       } else {
-        BaseRecord = cast<CXXRecordDecl>(
-            BaseSpec.getType()->castAs<RecordType>()->getDecl());
+        BaseRecord = cast<CXXRecordDecl>(BaseSpec.getType()->getAsRecordDecl());
       }
       if (BaseRecord &&
           lookupInBases(Context, BaseRecord, BaseMatches, LookupInDependent)) {
diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp
index ed0c7159dfc889..0d53a9d07d76de 100644
--- a/clang/test/CXX/drs/cwg5xx.cpp
+++ b/clang/test/CXX/drs/cwg5xx.cpp
@@ -1178,17 +1178,61 @@ namespace cwg590 { // cwg590: yes
   template<typename T> typename A<T>::B::C A<T>::B::C::f(A<T>::B::C) {}
 }
 
-namespace cwg591 { // cwg591: no
+namespace cwg591 { // cwg591: yes
   template<typename T> struct A {
     typedef int M;
     struct B {
       typedef void M;
       struct C;
+      struct D;
+    };
+  };
+
+  template<typename T> struct G {
+    struct B {
+      typedef int M;
+      struct C {
+        typedef void M;
+        struct D;
+      };
+    };
+  };
+
+  template<typename T> struct H {
+    template<typename U> struct B {
+      typedef int M;
+      template<typename F> struct C {
+        typedef void M;
+        struct D;
+        struct P;
+      };
     };
   };
 
   template<typename T> struct A<T>::B::C : A<T> {
-    // FIXME: Should find member of non-dependent base class A<T>.
+    M m;
+  };
+
+  template<typename T> struct G<T>::B::C::D : B {
+    M m;
+  };
+
+  template<typename T>
+  template<typename U>
+  template<typename F>
+  struct H<T>::B<U>::C<F>::D : B<U> {
+    M m;
+  };
+
+  template<typename T> struct A<T>::B::D : A<T*> {
+    M m;
+    // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}}
+  };
+
+  template<typename T>
+  template<typename U>
+  template<typename F>
+  struct H<T>::B<U>::C<F>::P : B<F> {
     M m;
     // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}}
   };
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index 186f7cc0ace546..c773c58fac4d0f 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -3599,7 +3599,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/591.html">591</a></td>
     <td>CD4</td>
     <td>When a dependent base class is the current instantiation</td>
-    <td class="none" align="center">No</td>
+    <td class="none" align="center">Yes</td>
   </tr>
   <tr id="592">
     <td><a href="https://cplusplus.github.io/CWG/issues/592.html">592</a></td>

From ec4c47d9490c90a98f2dda3fc9ff7c51782678f8 Mon Sep 17 00:00:00 2001
From: ykiko <ykikoykikoykiko@gmail.com>
Date: Tue, 26 Nov 2024 20:03:39 +0800
Subject: [PATCH 02/41] Add code completion for C++20 keywords. (#107982)

This commit adds code completion for C++20 keywords, fix
https://github.com/llvm/llvm-project/issues/107868.

1. complete `concept` in template context
    - [x] `template<typename T> conce^` -> `concept`
    - [ ] `conce^`

2. complete `requires`
- [x] constraints in template context: `template<typename T> requi^` ->
`requires`
- [x] requires expression: `int x = requ^` -> `requires (parameters) {
requirements }`
- [x] nested requirement: `requires { requ^ }` -> `requires expression
;`

3. complete coroutine keywords
    - [x] `co_await^` in expression: `co_aw^` -> `co_await expression;`
- [x] `co_yield` in function body: `co_yi^` -> `co_yield expression;`
- [x] `co_return` in function body: `co_re^` -> `co_return expression;`

4. specifiers: `char8_t`, `consteval`, `constinit`
---
 clang-tools-extra/docs/ReleaseNotes.rst      |   2 +
 clang/lib/Parse/ParseDeclCXX.cpp             |   9 ++
 clang/lib/Sema/SemaCodeComplete.cpp          | 128 +++++++++++++++++++
 clang/test/CodeCompletion/keywords-cxx20.cpp |  57 +++++++++
 4 files changed, 196 insertions(+)
 create mode 100644 clang/test/CodeCompletion/keywords-cxx20.cpp

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index fec2c20206bc4d..f8507156aa4198 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -73,6 +73,8 @@ Hover
 Code completion
 ^^^^^^^^^^^^^^^
 
+- Added completion for C++20 keywords.
+
 Code actions
 ^^^^^^^^^^^^
 
diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp
index 010ac9c1a3e3a9..f30603feb65c5d 100644
--- a/clang/lib/Parse/ParseDeclCXX.cpp
+++ b/clang/lib/Parse/ParseDeclCXX.cpp
@@ -460,6 +460,15 @@ Decl *Parser::ParseExportDeclaration() {
   assert(Tok.is(tok::kw_export));
   SourceLocation ExportLoc = ConsumeToken();
 
+  if (Tok.is(tok::code_completion)) {
+    cutOffParsing();
+    Actions.CodeCompletion().CodeCompleteOrdinaryName(
+        getCurScope(), PP.isIncrementalProcessingEnabled()
+                           ? SemaCodeCompletion::PCC_TopLevelOrExpression
+                           : SemaCodeCompletion::PCC_Namespace);
+    return nullptr;
+  }
+
   ParseScope ExportScope(this, Scope::DeclScope);
   Decl *ExportDecl = Actions.ActOnStartExportDecl(
       getCurScope(), ExportLoc,
diff --git a/clang/lib/Sema/SemaCodeComplete.cpp b/clang/lib/Sema/SemaCodeComplete.cpp
index 12da3a2cbca314..60ea1383b2a6ee 100644
--- a/clang/lib/Sema/SemaCodeComplete.cpp
+++ b/clang/lib/Sema/SemaCodeComplete.cpp
@@ -1836,6 +1836,9 @@ static void AddTypeSpecifierResults(const LangOptions &LangOpts,
       Builder.AddChunk(CodeCompletionString::CK_RightParen);
       Results.AddResult(Result(Builder.TakeString()));
     }
+
+    if (LangOpts.Char8 || LangOpts.CPlusPlus20)
+      Results.AddResult(Result("char8_t", CCP_Type));
   } else
     Results.AddResult(Result("__auto_type", CCP_Type));
 
@@ -1888,6 +1891,9 @@ AddStorageSpecifiers(SemaCodeCompletion::ParserCompletionContext CCC,
     Results.AddResult(Result("constexpr"));
     Results.AddResult(Result("thread_local"));
   }
+
+  if (LangOpts.CPlusPlus20)
+    Results.AddResult(Result("constinit"));
 }
 
 static void
@@ -1911,6 +1917,9 @@ AddFunctionSpecifiers(SemaCodeCompletion::ParserCompletionContext CCC,
   case SemaCodeCompletion::PCC_Template:
     if (LangOpts.CPlusPlus || LangOpts.C99)
       Results.AddResult(Result("inline"));
+
+    if (LangOpts.CPlusPlus20)
+      Results.AddResult(Result("consteval"));
     break;
 
   case SemaCodeCompletion::PCC_ObjCInstanceVariableList:
@@ -2186,6 +2195,69 @@ AddOrdinaryNameResults(SemaCodeCompletion::ParserCompletionContext CCC,
       } else {
         Results.AddResult(Result("template", CodeCompletionResult::RK_Keyword));
       }
+
+      if (SemaRef.getLangOpts().CPlusPlus20 &&
+          SemaRef.getLangOpts().CPlusPlusModules) {
+        clang::Module *CurrentModule = SemaRef.getCurrentModule();
+        if (SemaRef.CurContext->isTranslationUnit()) {
+          /// Global module fragment can only be declared in the beginning of
+          /// the file. CurrentModule should be null in this case.
+          if (!CurrentModule) {
+            // module;
+            Builder.AddTypedTextChunk("module");
+            Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+            Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+            Results.AddResult(Result(Builder.TakeString()));
+          }
+
+          /// Named module should be declared in the beginning of the file,
+          /// or after the global module fragment.
+          if (!CurrentModule ||
+              CurrentModule->Kind == Module::ExplicitGlobalModuleFragment ||
+              CurrentModule->Kind == Module::ImplicitGlobalModuleFragment) {
+            // export module;
+            // module name;
+            Builder.AddTypedTextChunk("module");
+            Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+            Builder.AddPlaceholderChunk("name");
+            Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+            Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+            Results.AddResult(Result(Builder.TakeString()));
+          }
+
+          /// Import can occur in non module file or after the named module
+          /// declaration.
+          if (!CurrentModule ||
+              CurrentModule->Kind == Module::ModuleInterfaceUnit ||
+              CurrentModule->Kind == Module::ModulePartitionInterface) {
+            // import name;
+            Builder.AddTypedTextChunk("import");
+            Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+            Builder.AddPlaceholderChunk("name");
+            Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+            Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+            Results.AddResult(Result(Builder.TakeString()));
+          }
+
+          if (CurrentModule &&
+              (CurrentModule->Kind == Module::ModuleInterfaceUnit ||
+               CurrentModule->Kind == Module::ModulePartitionInterface)) {
+            // module: private;
+            Builder.AddTypedTextChunk("module");
+            Builder.AddChunk(CodeCompletionString::CK_Colon);
+            Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+            Builder.AddTypedTextChunk("private");
+            Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+            Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+            Results.AddResult(Result(Builder.TakeString()));
+          }
+        }
+
+        // export
+        if (!CurrentModule ||
+            CurrentModule->Kind != Module::ModuleKind::PrivateModuleFragment)
+          Results.AddResult(Result("export", CodeCompletionResult::RK_Keyword));
+      }
     }
 
     if (SemaRef.getLangOpts().ObjC)
@@ -2253,6 +2325,11 @@ AddOrdinaryNameResults(SemaCodeCompletion::ParserCompletionContext CCC,
     [[fallthrough]];
 
   case SemaCodeCompletion::PCC_Template:
+    if (SemaRef.getLangOpts().CPlusPlus20 &&
+        CCC == SemaCodeCompletion::PCC_Template)
+      Results.AddResult(Result("concept", CCP_Keyword));
+    [[fallthrough]];
+
   case SemaCodeCompletion::PCC_MemberTemplate:
     if (SemaRef.getLangOpts().CPlusPlus && Results.includeCodePatterns()) {
       // template < parameters >
@@ -2265,6 +2342,11 @@ AddOrdinaryNameResults(SemaCodeCompletion::ParserCompletionContext CCC,
       Results.AddResult(Result("template", CodeCompletionResult::RK_Keyword));
     }
 
+    if (SemaRef.getLangOpts().CPlusPlus20 &&
+        (CCC == SemaCodeCompletion::PCC_Template ||
+         CCC == SemaCodeCompletion::PCC_MemberTemplate))
+      Results.AddResult(Result("requires", CCP_Keyword));
+
     AddStorageSpecifiers(CCC, SemaRef.getLangOpts(), Results);
     AddFunctionSpecifiers(CCC, SemaRef.getLangOpts(), Results);
     break;
@@ -2486,6 +2568,14 @@ AddOrdinaryNameResults(SemaCodeCompletion::ParserCompletionContext CCC,
       Builder.AddPlaceholderChunk("expression");
       Builder.AddChunk(CodeCompletionString::CK_SemiColon);
       Results.AddResult(Result(Builder.TakeString()));
+      // "co_return expression ;" for coroutines(C++20).
+      if (SemaRef.getLangOpts().CPlusPlus20) {
+        Builder.AddTypedTextChunk("co_return");
+        Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+        Builder.AddPlaceholderChunk("expression");
+        Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+        Results.AddResult(Result(Builder.TakeString()));
+      }
       // When boolean, also add 'return true;' and 'return false;'.
       if (ReturnType->isBooleanType()) {
         Builder.AddTypedTextChunk("return true");
@@ -2706,6 +2796,44 @@ AddOrdinaryNameResults(SemaCodeCompletion::ParserCompletionContext CCC,
         Builder.AddChunk(CodeCompletionString::CK_RightParen);
         Results.AddResult(Result(Builder.TakeString()));
       }
+
+      if (SemaRef.getLangOpts().CPlusPlus20) {
+        // co_await expression
+        Builder.AddTypedTextChunk("co_await");
+        Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+        Builder.AddPlaceholderChunk("expression");
+        Results.AddResult(Result(Builder.TakeString()));
+
+        // co_yield expression
+        Builder.AddTypedTextChunk("co_yield");
+        Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+        Builder.AddPlaceholderChunk("expression");
+        Results.AddResult(Result(Builder.TakeString()));
+
+        // requires (parameters) { requirements }
+        Builder.AddResultTypeChunk("bool");
+        Builder.AddTypedTextChunk("requires");
+        Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+        Builder.AddChunk(CodeCompletionString::CK_LeftParen);
+        Builder.AddPlaceholderChunk("parameters");
+        Builder.AddChunk(CodeCompletionString::CK_RightParen);
+        Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+        Builder.AddChunk(CodeCompletionString::CK_LeftBrace);
+        Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+        Builder.AddPlaceholderChunk("requirements");
+        Builder.AddChunk(CodeCompletionString::CK_VerticalSpace);
+        Builder.AddChunk(CodeCompletionString::CK_RightBrace);
+        Results.AddResult(Result(Builder.TakeString()));
+
+        if (SemaRef.CurContext->isRequiresExprBody()) {
+          // requires expression ;
+          Builder.AddTypedTextChunk("requires");
+          Builder.AddChunk(CodeCompletionString::CK_HorizontalSpace);
+          Builder.AddPlaceholderChunk("expression");
+          Builder.AddChunk(CodeCompletionString::CK_SemiColon);
+          Results.AddResult(Result(Builder.TakeString()));
+        }
+      }
     }
 
     if (SemaRef.getLangOpts().ObjC) {
diff --git a/clang/test/CodeCompletion/keywords-cxx20.cpp b/clang/test/CodeCompletion/keywords-cxx20.cpp
new file mode 100644
index 00000000000000..612c3c0045e394
--- /dev/null
+++ b/clang/test/CodeCompletion/keywords-cxx20.cpp
@@ -0,0 +1,57 @@
+module;
+
+export module M;
+
+export const char8_t x = 1;
+
+template<typename T> requires true
+const int y = requires { typename T::type; requires T::value; };
+
+class co_test {};
+
+int f(){ co_test test; return 1; }
+
+module: private;
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:1:3 %s | FileCheck --check-prefix=CHECK-MODULE1 %s
+// CHECK-MODULE1: module;
+// CHECK-MODULE1: module <#name#>;
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:3:11 %s | FileCheck --check-prefix=CHECK-MODULE2 %s
+// CHECK-MODULE2: module <#name#>;
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:14:3 %s | FileCheck --check-prefix=CHECK-MODULE3 %s
+// CHECK-MODULE3: module: private;
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:3:3 %s | FileCheck --check-prefix=CHECK-EXPORT %s
+// CHECK-EXPORT: export
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:5:11 %s | FileCheck --check-prefix=CHECK-CONST %s
+// CHECK-CONST: const
+// CHECK-CONST: consteval
+// CHECK-CONST: constexpr
+// CHECK-CONST: constinit
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:5:19 %s | FileCheck --check-prefix=CHECK-CHAR %s
+// CHECK-CHAR: char8_t
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:8:3 %s | FileCheck --check-prefix=CHECK-CONSTRAINT %s
+// CHECK-CONSTRAINT: concept
+// CHECK-CONSTRAINT: const
+// CHECK-CONSTRAINT: consteval
+// CHECK-CONSTRAINT: constexpr
+// CHECK-CONSTRAINT: constinit
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:7:27 %s | FileCheck --check-prefix=CHECK-REQUIRES2 %s
+// CHECK-REQUIRES2: requires
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:8:20 %s | FileCheck -check-prefix=CHECK-REQUIRE %s
+// CHECK-REQUIRE: [#bool#]requires (<#parameters#>) {
+// CHECK-REQUIRE: <#requirements#>
+// CHECK-REQUIRE: }
+
+// RUN: %clang_cc1 -std=c++20 -code-completion-at=%s:12:13 %s | FileCheck --check-prefix=CHECK-COROUTINE %s
+// CHECK-COROUTINE: co_await <#expression#>
+// CHECK-COROUTINE: co_return <#expression#>;
+// CHECK-COROUTINE: co_yield <#expression#>
+

From 90df66455b2ff6a3b3754a56afafc05935a05e15 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Tue, 26 Nov 2024 11:57:30 +0000
Subject: [PATCH 03/41] [MCA][X86] Fix throughput of (V)PMOV
 extension/truncation 512-bit instructions

znver4 512-bit instructions are half rate of 128/256-bit variants (still 1uop though)

Confirmed with Agner/uops.info

Noticed while triaging #110308 and #117579
---
 llvm/lib/Target/X86/X86ScheduleZnver4.td      |   4 +-
 .../llvm-mca/X86/Znver4/resources-avx512.s    | 302 +++++++++---------
 .../llvm-mca/X86/Znver4/resources-avx512bw.s  |  62 ++--
 3 files changed, 184 insertions(+), 184 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td
index bb412fa88b0f12..38f9b5ef1d80be 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver4.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td
@@ -1652,7 +1652,7 @@ def : InstRW<[Zn4MOVS], (instregex
 
 def Zn4MOVSZ: SchedWriteRes<[Zn4FPFMisc12]> {
   let Latency = 4;
-  let ReleaseAtCycles = [4];
+  let ReleaseAtCycles = [2];
   let NumMicroOps = 1;
 }
 def : InstRW<[Zn4MOVSZ], (instregex
@@ -1661,7 +1661,7 @@ def : InstRW<[Zn4MOVSZ], (instregex
 
 def Zn4MOVSrr: SchedWriteRes<[Zn4FPFMisc12]> {
   let Latency = 5;
-  let ReleaseAtCycles = [5];
+  let ReleaseAtCycles = [2];
   let NumMicroOps = 1;
 }
 def : InstRW<[Zn4MOVSrr], (instregex
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
index ca0a81e4207d53..72d7de33533467 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512.s
@@ -1736,140 +1736,140 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherdd	(%rax,%zmm1,2), %zmm2 {%k1}
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherqq	(%rax,%zmm1,2), %zmm2 {%k1}
 # CHECK-NEXT:  1      5     0.33    *                   vpgatherqd	(%rax,%zmm1,2), %ymm2 {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovdb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovdb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovdb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovdw	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovdw	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovdw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovqb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovqb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovqb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovqd	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovqd	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovqd	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovqw	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovqw	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovqw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovsdb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovsdb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovsdb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovsdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovsdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovsdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovsdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovsdw	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovsdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovsdw	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovsdw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovsdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovsdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovsdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovsdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovsdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovsqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqd	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovsqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqd	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqd	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovsqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqw	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovsqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqw	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovsqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovsqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovsqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovsqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbd	%xmm16, %zmm19
+# CHECK-NEXT:  1      5     1.00                        vpmovsqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbd	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbd	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbd	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbd	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbd	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbd	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbq	%xmm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbq	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxdq	%ymm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovsxdq	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxdq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxdq	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxdq	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxdq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxdq	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxdq	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxdq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwd	%ymm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwd	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwd	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwd	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwd	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwd	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwd	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwq	%xmm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwq	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxwq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxwq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxwq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovusdb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovusdb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovusdb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovusdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovusdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovusdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovusdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovusdw	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovusdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovusdw	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovusdw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovusdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovusdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovusdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovusdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqb	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovusdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqb	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqb	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovusqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqd	%zmm19, %ymm16
+# CHECK-NEXT:  1      5     1.00                        vpmovusqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqd	%zmm19, %ymm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqd	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovusqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqw	%zmm19, %xmm16
+# CHECK-NEXT:  1      5     1.00                        vpmovusqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqw	%zmm19, %xmm16
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqw	%zmm19, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovusqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovusqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovusqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovusqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbd	%xmm16, %zmm19
+# CHECK-NEXT:  1      5     1.00                        vpmovusqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbd	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbd	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbd	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbd	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbd	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbd	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbq	%xmm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbq	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxdq	%ymm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovzxdq	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxdq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxdq	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxdq	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxdq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxdq	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxdq	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxdq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwd	%ymm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwd	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwd	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwd	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwd	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwd	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwd	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwq	%xmm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwq	%xmm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwq	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxwq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxwq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxwq	(%rax), %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      3     1.00                        vpmulld	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  1      10    1.00    *                   vpmulld	(%rax), %zmm17, %zmm19
@@ -2233,7 +2233,7 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 5.33   5.33   5.33    -      -      -      -      -     219.50 1216.50 774.00 351.00 312.50 312.50 17.00 215.67 215.67 215.67 204.67 204.67 204.67 16.50  16.50
+# CHECK-NEXT: 5.33   5.33   5.33    -      -      -      -      -     219.50 1119.00 676.50 351.00 312.50 312.50 17.00 215.67 215.67 215.67 204.67 204.67 204.67 16.50  16.50
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2793,140 +2793,140 @@ vunpcklps         (%rax){1to16}, %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherdd	(%rax,%zmm1,2), %zmm2 {%k1}
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherqq	(%rax,%zmm1,2), %zmm2 {%k1}
 # CHECK-NEXT: 0.33   0.33   0.33    -      -      -      -      -      -      -      -      -      -      -      -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpgatherqd	(%rax,%zmm1,2), %ymm2 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovdb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovdw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqd	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsdb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsdw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqd	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbd	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbd	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxdq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxdq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxdq	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxdq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwd	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwd	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxwq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxwq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusdb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusdb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusdw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusdw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusdw	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqb	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqb	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqb	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqd	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqd	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqd	%zmm19, %ymm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqw	%zmm19, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovusqw	%zmm19, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovusqw	%zmm19, %xmm16 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbd	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbd	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxdq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxdq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxdq	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxdq	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwd	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwd	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwd	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwd	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwq	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwq	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxwq	%xmm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxwq	(%rax), %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vpmulld	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -     1.00   0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmulld	(%rax), %zmm17, %zmm19
diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bw.s b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bw.s
index 3280ed3b8aab6f..ffdfe3fe98955e 100644
--- a/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bw.s
+++ b/llvm/test/tools/llvm-mca/X86/Znver4/resources-avx512bw.s
@@ -912,32 +912,32 @@ vpunpcklwd        (%rax), %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  1      1     1.00                        vpmovw2m	%zmm0, %k0
 # CHECK-NEXT:  1      0     0.17                        vpmovm2b	%k0, %zmm0
 # CHECK-NEXT:  1      0     0.17                        vpmovm2w	%k0, %zmm0
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbw	%ymm16, %zmm19
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbw	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbw	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbw	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbw	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbw	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovsxbw	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovsxbw	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovsxbw	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovswb	%zmm16, %ymm19
+# CHECK-NEXT:  1      5     1.00                        vpmovswb	%zmm16, %ymm19
 # CHECK-NEXT:  1      11    1.50           *            vpmovswb	%zmm16, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovswb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovswb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovswb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovswb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovuswb	%zmm16, %ymm19
+# CHECK-NEXT:  1      5     1.00                        vpmovswb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovuswb	%zmm16, %ymm19
 # CHECK-NEXT:  1      11    1.50           *            vpmovuswb	%zmm16, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovuswb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovuswb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovuswb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovuswb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  1      5     2.50                        vpmovwb	%zmm16, %ymm19
+# CHECK-NEXT:  1      5     1.00                        vpmovuswb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      5     1.00                        vpmovwb	%zmm16, %ymm19
 # CHECK-NEXT:  1      11    1.50           *            vpmovwb	%zmm16, (%rax)
-# CHECK-NEXT:  1      5     2.50                        vpmovwb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  1      5     1.00                        vpmovwb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  1      11    1.50           *            vpmovwb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  1      5     2.50                        vpmovwb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbw	%ymm16, %zmm19
+# CHECK-NEXT:  1      5     1.00                        vpmovwb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbw	%ymm16, %zmm19
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbw	(%rax), %zmm19
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbw	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbw	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbw	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  1      4     2.00                        vpmovzxbw	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  1      4     1.00                        vpmovzxbw	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      11    1.50    *                   vpmovzxbw	(%rax), %zmm19 {%k1} {z}
 # CHECK-NEXT:  1      3     1.00                        vpmulhrsw	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  1      10    1.00    *                   vpmulhrsw	(%rax), %zmm17, %zmm19
@@ -1145,7 +1145,7 @@ vpunpcklwd        (%rax), %zmm17, %zmm19 {z}{k1}
 
 # CHECK:      Resource pressure per iteration:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
-# CHECK-NEXT: 0.67   0.67   0.67   2.00   2.00   2.00   2.00    -     177.50 322.50 288.00 161.00 123.00 123.00 6.00   77.33  77.33  77.33  75.33  75.33  75.33  3.00   3.00
+# CHECK-NEXT: 0.67   0.67   0.67   2.00   2.00   2.00   2.00    -     177.50 303.00 268.50 161.00 123.00 123.00 6.00   77.33  77.33  77.33  75.33  75.33  75.33  3.00   3.00
 
 # CHECK:      Resource pressure by instruction:
 # CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    [10]   [11]   [12.0] [12.1] [13]   [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -1447,32 +1447,32 @@ vpunpcklwd        (%rax), %zmm17, %zmm19 {z}{k1}
 # CHECK-NEXT:  -      -      -     1.00   1.00   1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpmovw2m	%zmm0, %k0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpmovm2b	%k0, %zmm0
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -      -     vpmovm2w	%k0, %zmm0
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbw	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbw	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovsxbw	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovsxbw	(%rax), %zmm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovswb	%zmm16, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovswb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovswb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovuswb	%zmm16, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovuswb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovuswb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovwb	%zmm16, (%rax)
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovwb	%zmm16, (%rax) {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.50   2.50    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19 {%k1} {z}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovwb	%zmm16, %ymm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbw	(%rax), %zmm19
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19 {%k1}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19 {%k1}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbw	(%rax), %zmm19 {%k1}
-# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     2.00   2.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19 {%k1} {z}
+# CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.00   1.00    -      -      -      -      -      -      -      -      -      -      -      -     vpmovzxbw	%ymm16, %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -      -     1.50   1.50    -     0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmovzxbw	(%rax), %zmm19 {%k1} {z}
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -     1.00    -      -      -      -      -      -      -      -      -      -      -     vpmulhrsw	%zmm16, %zmm17, %zmm19
 # CHECK-NEXT:  -      -      -      -      -      -      -      -     1.00    -      -     1.00   0.50   0.50    -     0.33   0.33   0.33   0.33   0.33   0.33    -      -     vpmulhrsw	(%rax), %zmm17, %zmm19

From 827ebf84e9af7c93a30daf4ed17e99ccef4cf94a Mon Sep 17 00:00:00 2001
From: c8ef <c8ef@outlook.com>
Date: Tue, 26 Nov 2024 20:31:29 +0800
Subject: [PATCH 04/41] [clang] constexpr built-in elementwise popcount
 function. (#117473)

Part of #51787.

This patch adds constexpr support for the built-in elementwise popcount
function.
---
 clang/docs/LanguageExtensions.rst             |  3 ++
 clang/docs/ReleaseNotes.rst                   |  1 +
 clang/include/clang/Basic/Builtins.td         |  2 +-
 clang/lib/AST/ExprConstant.cpp                | 31 +++++++++++++++++++
 .../test/CodeGen/builtins-elementwise-math.c  | 10 +++---
 clang/test/Sema/constant_builtins_vector.cpp  | 20 ++++++++++++
 6 files changed, 60 insertions(+), 7 deletions(-)

diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
index 3c9078bcdf8118..c053a5ab3c528c 100644
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -647,6 +647,9 @@ elementwise to the input.
 
 Unless specified otherwise operation(±0) = ±0 and operation(±infinity) = ±infinity
 
+The integer elementwise intrinsics, including ``__builtin_elementwise_popcount``,
+can be called in a ``constexpr`` context.
+
 ============================================== ====================================================================== =========================================
          Name                                   Operation                                                             Supported element types
 ============================================== ====================================================================== =========================================
diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index ff24161de493c7..6c2777f01c4808 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -375,6 +375,7 @@ Non-comprehensive list of changes in this release
 - ``__builtin_reduce_mul`` function can now be used in constant expressions.
 - ``__builtin_reduce_and`` function can now be used in constant expressions.
 - ``__builtin_reduce_or`` and ``__builtin_reduce_xor`` functions can now be used in constant expressions.
+- ``__builtin_elementwise_popcount`` function can now be used in constant expressions.
 
 New Compiler Flags
 ------------------
diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td
index eaff744924805e..db5cd73fba8ad1 100644
--- a/clang/include/clang/Basic/Builtins.td
+++ b/clang/include/clang/Basic/Builtins.td
@@ -1354,7 +1354,7 @@ def ElementwiseLog10 : Builtin {
 
 def ElementwisePopcount : Builtin {
   let Spellings = ["__builtin_elementwise_popcount"];
-  let Attributes = [NoThrow, Const, CustomTypeChecking];
+  let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr];
   let Prototype = "void(...)";
 }
 
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index c6d003073966f3..bb5ab67328fbc6 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11005,6 +11005,7 @@ namespace {
     bool VisitUnaryImag(const UnaryOperator *E);
     bool VisitBinaryOperator(const BinaryOperator *E);
     bool VisitUnaryOperator(const UnaryOperator *E);
+    bool VisitCallExpr(const CallExpr *E);
     bool VisitConvertVectorExpr(const ConvertVectorExpr *E);
     bool VisitShuffleVectorExpr(const ShuffleVectorExpr *E);
 
@@ -11302,6 +11303,35 @@ static bool handleVectorElementCast(EvalInfo &Info, const FPOptions FPO,
   return false;
 }
 
+bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) {
+  if (!IsConstantEvaluatedBuiltinCall(E))
+    return ExprEvaluatorBaseTy::VisitCallExpr(E);
+
+  switch (E->getBuiltinCallee()) {
+  default:
+    return false;
+  case Builtin::BI__builtin_elementwise_popcount: {
+    APValue Source;
+    if (!EvaluateAsRValue(Info, E->getArg(0), Source))
+      return false;
+
+    QualType DestEltTy = E->getType()->castAs<VectorType>()->getElementType();
+    unsigned SourceLen = Source.getVectorLength();
+    SmallVector<APValue, 4> ResultElements;
+    ResultElements.reserve(SourceLen);
+
+    for (unsigned EltNum = 0; EltNum < SourceLen; ++EltNum) {
+      APSInt Elt = Source.getVectorElt(EltNum).getInt();
+      ResultElements.push_back(
+          APValue(APSInt(APInt(Info.Ctx.getIntWidth(DestEltTy), Elt.popcount()),
+                         DestEltTy->isUnsignedIntegerOrEnumerationType())));
+    }
+
+    return Success(APValue(ResultElements.data(), ResultElements.size()), E);
+  }
+  }
+}
+
 bool VectorExprEvaluator::VisitConvertVectorExpr(const ConvertVectorExpr *E) {
   APValue Source;
   QualType SourceVecType = E->getSrcExpr()->getType();
@@ -13118,6 +13148,7 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E,
   case Builtin::BI__builtin_popcountl:
   case Builtin::BI__builtin_popcountll:
   case Builtin::BI__builtin_popcountg:
+  case Builtin::BI__builtin_elementwise_popcount:
   case Builtin::BI__popcnt16: // Microsoft variants of popcount
   case Builtin::BI__popcnt:
   case Builtin::BI__popcnt64: {
diff --git a/clang/test/CodeGen/builtins-elementwise-math.c b/clang/test/CodeGen/builtins-elementwise-math.c
index 748b5e4add7c76..f1f34432ca0ea1 100644
--- a/clang/test/CodeGen/builtins-elementwise-math.c
+++ b/clang/test/CodeGen/builtins-elementwise-math.c
@@ -666,11 +666,9 @@ void test_builtin_elementwise_log2(float f1, float f2, double d1, double d2,
   vf2 = __builtin_elementwise_log2(vf1);
 }
 
-void test_builtin_elementwise_popcount(si8 vi1, si8 vi2,
-                                  long long int i1, long long int i2, short si,
-                                  _BitInt(31) bi1, _BitInt(31) bi2) {
-
-  
+void test_builtin_elementwise_popcount(si8 vi1, si8 vi2, long long int i1,
+                                       long long int i2, short si,
+                                       _BitInt(31) bi1, _BitInt(31) bi2) {
   // CHECK:      [[I1:%.+]] = load i64, ptr %i1.addr, align 8
   // CHECK-NEXT: call i64 @llvm.ctpop.i64(i64 [[I1]])
   i2 = __builtin_elementwise_popcount(i1);
@@ -693,7 +691,7 @@ void test_builtin_elementwise_popcount(si8 vi1, si8 vi2,
   // CHECK-NEXT: call i32 @llvm.ctpop.i32(i32 [[IA1]])
   b = __builtin_elementwise_popcount(int_as_one);
 
-  // CHECK:   call i32 @llvm.ctpop.i32(i32 -10)
+  // CHECK:      store i32 30, ptr @b, align 4
   b = __builtin_elementwise_popcount(-10);
 
   // CHECK:      [[SI:%.+]] = load i16, ptr %si.addr, align 2
diff --git a/clang/test/Sema/constant_builtins_vector.cpp b/clang/test/Sema/constant_builtins_vector.cpp
index e84d09b24672b4..772a682141ce41 100644
--- a/clang/test/Sema/constant_builtins_vector.cpp
+++ b/clang/test/Sema/constant_builtins_vector.cpp
@@ -797,3 +797,23 @@ static_assert(__builtin_reduce_xor((vector4int){(int)0x11111111, (int)0x22222222
 static_assert(__builtin_reduce_xor((vector4long){(long long)0x1111111111111111L, (long long)0x2222222222222222L, (long long)0x4444444444444444L, (long long)0x8888888888888888L}) == (long long)0xFFFFFFFFFFFFFFFFL);
 static_assert(__builtin_reduce_xor((vector4uint){0x11111111U, 0x22222222U, 0x44444444U, 0x88888888U}) == 0xFFFFFFFFU);
 static_assert(__builtin_reduce_xor((vector4ulong){0x1111111111111111UL, 0x2222222222222222UL, 0x4444444444444444UL, 0x8888888888888888UL}) == 0xFFFFFFFFFFFFFFFFUL);
+
+static_assert(__builtin_bit_cast(unsigned, __builtin_elementwise_popcount((vector4char){1, 2, 3, 4})) == (LITTLE_END ? 0x01020101 : 0x01010201));
+static_assert(__builtin_bit_cast(unsigned long long, __builtin_elementwise_popcount((vector4short){0, 0x0F0F, ~0, ~0x0F0F})) == (LITTLE_END ? 0x0008001000080000 : 0x0000000800100008));
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4int){1, 2, 3, 4})) == 5);
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4int){0, 0xF0F0, ~0, ~0xF0F0})) == 16 * sizeof(int));
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4long){1L, 2L, 3L, 4L})) == 5L);
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4long){0L, 0xF0F0L, ~0L, ~0xF0F0L})) == 16 * sizeof(long long));
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4uint){1U, 2U, 3U, 4U})) == 5U);
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4uint){0U, 0xF0F0U, ~0U, ~0xF0F0U})) == 16 * sizeof(int));
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4ulong){1UL, 2UL, 3UL, 4UL})) == 5UL);
+static_assert(__builtin_reduce_add(__builtin_elementwise_popcount((vector4ulong){0ULL, 0xF0F0ULL, ~0ULL, ~0xF0F0ULL})) == 16 * sizeof(unsigned long long));
+static_assert(__builtin_elementwise_popcount(0) == 0);
+static_assert(__builtin_elementwise_popcount(0xF0F0) == 8);
+static_assert(__builtin_elementwise_popcount(~0) == 8 * sizeof(int));
+static_assert(__builtin_elementwise_popcount(0U) == 0);
+static_assert(__builtin_elementwise_popcount(0xF0F0U) == 8);
+static_assert(__builtin_elementwise_popcount(~0U) == 8 * sizeof(int));
+static_assert(__builtin_elementwise_popcount(0L) == 0);
+static_assert(__builtin_elementwise_popcount(0xF0F0L) == 8);
+static_assert(__builtin_elementwise_popcount(~0LL) == 8 * sizeof(long long));

From f94bd3c933076500b9291009d8cb5e1139c52a06 Mon Sep 17 00:00:00 2001
From: Mehdi Amini <joker.eph@gmail.com>
Date: Tue, 26 Nov 2024 13:45:47 +0100
Subject: [PATCH 05/41] Revert "[RISCV] Implement tail call optimization in
 machine outliner" (#117710)

Reverts llvm/llvm-project#115297
Bots are broken
---
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |   6 -
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |   6 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      | 143 ++++--------------
 .../CodeGen/RISCV/machine-outliner-call.ll    |  70 ---------
 .../CodeGen/RISCV/machine-outliner-cfi.mir    |  22 ++-
 .../machine-outliner-leaf-descendants.ll      |  13 +-
 .../RISCV/machine-outliner-patchable.ll       |  24 +--
 .../RISCV/machine-outliner-position.mir       |  21 +--
 .../test/CodeGen/RISCV/machineoutliner-x5.mir |  54 -------
 llvm/test/CodeGen/RISCV/machineoutliner.mir   |  18 +--
 10 files changed, 79 insertions(+), 298 deletions(-)
 delete mode 100644 llvm/test/CodeGen/RISCV/machine-outliner-call.ll
 delete mode 100644 llvm/test/CodeGen/RISCV/machineoutliner-x5.mir

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index ca2f868cd4e764..19103e219cb800 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -208,12 +208,6 @@ static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
   return Desc.getNumOperands() - Offset;
 }
 
-static inline unsigned getTailExpandUseRegNo(const FeatureBitset &FeatureBits) {
-  // For Zicfilp, PseudoTAIL should be expanded to a software guarded branch.
-  // It means to use t2(x7) as rs1 of JALR to expand PseudoTAIL.
-  return FeatureBits[RISCV::FeatureStdExtZicfilp] ? RISCV::X7 : RISCV::X6;
-}
-
 static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
   const uint64_t TSFlags = Desc.TSFlags;
   assert(hasSEWOp(TSFlags));
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index a28bf1186589d9..6bb49e2bb85fe1 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -124,7 +124,11 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI,
   MCRegister Ra;
   if (MI.getOpcode() == RISCV::PseudoTAIL) {
     Func = MI.getOperand(0);
-    Ra = RISCVII::getTailExpandUseRegNo(STI.getFeatureBits());
+    Ra = RISCV::X6;
+    // For Zicfilp, PseudoTAIL should be expanded to a software guarded branch.
+    // It means to use t2(x7) as rs1 of JALR to expand PseudoTAIL.
+    if (STI.hasFeature(RISCV::FeatureStdExtZicfilp))
+      Ra = RISCV::X7;
   } else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
     Func = MI.getOperand(1);
     Ra = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 47273d6bc06d65..933e776da47404 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -11,7 +11,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVInstrInfo.h"
-#include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "RISCV.h"
 #include "RISCVMachineFunctionInfo.h"
@@ -2928,7 +2927,6 @@ bool RISCVInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
 
 // Enum values indicating how an outlined call should be constructed.
 enum MachineOutlinerConstructionID {
-  MachineOutlinerTailCall,
   MachineOutlinerDefault
 };
 
@@ -2937,118 +2935,46 @@ bool RISCVInstrInfo::shouldOutlineFromFunctionByDefault(
   return MF.getFunction().hasMinSize();
 }
 
-static bool isCandidatePatchable(const MachineBasicBlock &MBB) {
-  const MachineFunction *MF = MBB.getParent();
-  const Function &F = MF->getFunction();
-  return F.getFnAttribute("fentry-call").getValueAsBool() ||
-         F.hasFnAttribute("patchable-function-entry");
-}
-
-static bool isMIReadsReg(const MachineInstr &MI, const TargetRegisterInfo *TRI,
-                         unsigned RegNo) {
-  return MI.readsRegister(RegNo, TRI) ||
-         MI.getDesc().hasImplicitUseOfPhysReg(RegNo);
-}
-
-static bool isMIModifiesReg(const MachineInstr &MI,
-                            const TargetRegisterInfo *TRI, unsigned RegNo) {
-  return MI.modifiesRegister(RegNo, TRI) ||
-         MI.getDesc().hasImplicitDefOfPhysReg(RegNo);
-}
-
-static bool cannotInsertTailCall(const MachineBasicBlock &MBB) {
-  if (!MBB.back().isReturn())
-    return true;
-  if (isCandidatePatchable(MBB))
-    return true;
-
-  // If the candidate reads the pre-set register
-  // that can be used for expanding PseudoTAIL instruction,
-  // then we cannot insert tail call.
-  const TargetSubtargetInfo &STI = MBB.getParent()->getSubtarget();
-  unsigned TailExpandUseRegNo =
-      RISCVII::getTailExpandUseRegNo(STI.getFeatureBits());
-  for (const MachineInstr &MI : MBB) {
-    if (isMIReadsReg(MI, STI.getRegisterInfo(), TailExpandUseRegNo))
-      return true;
-    if (isMIModifiesReg(MI, STI.getRegisterInfo(), TailExpandUseRegNo))
-      break;
-  }
-  return false;
-}
-
-static std::optional<MachineOutlinerConstructionID>
-analyzeCandidate(outliner::Candidate &C) {
-  // If last instruction is return then we can rely on
-  // the verification already performed in the getOutliningTypeImpl.
-  if (C.back().isReturn()) {
-    assert(!cannotInsertTailCall(*C.getMBB()) &&
-           "The candidate who uses return instruction must be outlined "
-           "using tail call");
-    return MachineOutlinerTailCall;
-  }
-
-  auto CandidateUsesX5 = [](outliner::Candidate &C) {
-    const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
-    if (std::any_of(C.begin(), C.end(), [TRI](const MachineInstr &MI) {
-          return isMIModifiesReg(MI, TRI, RISCV::X5);
-        }))
-      return true;
-    return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
-  };
-
-  if (!CandidateUsesX5(C))
-    return MachineOutlinerDefault;
-
-  return std::nullopt;
-}
-
 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
 RISCVInstrInfo::getOutliningCandidateInfo(
     const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs,
     unsigned MinRepeats) const {
 
-  // Each RepeatedSequenceLoc is identical.
-  outliner::Candidate &Candidate = RepeatedSequenceLocs[0];
-  auto CandidateInfo = analyzeCandidate(Candidate);
-  if (!CandidateInfo)
-    RepeatedSequenceLocs.clear();
+  // First we need to filter out candidates where the X5 register (IE t0) can't
+  // be used to setup the function call.
+  auto CannotInsertCall = [](outliner::Candidate &C) {
+    const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
+    return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
+  };
+
+  llvm::erase_if(RepeatedSequenceLocs, CannotInsertCall);
 
   // If the sequence doesn't have enough candidates left, then we're done.
   if (RepeatedSequenceLocs.size() < MinRepeats)
     return std::nullopt;
 
-  unsigned InstrSizeCExt =
-      Candidate.getMF()->getSubtarget<RISCVSubtarget>().hasStdExtCOrZca() ? 2
-                                                                          : 4;
-  unsigned CallOverhead = 0, FrameOverhead = 0;
-
-  MachineOutlinerConstructionID MOCI = CandidateInfo.value();
-  switch (MOCI) {
-  case MachineOutlinerDefault:
-    // call t0, function = 8 bytes.
-    CallOverhead = 8;
-    // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
-    FrameOverhead = InstrSizeCExt;
-    break;
-  case MachineOutlinerTailCall:
-    // tail call = auipc + jalr in the worst case without linker relaxation.
-    CallOverhead = 4 + InstrSizeCExt;
-    // Using tail call we move ret instruction from caller to callee.
-    FrameOverhead = 0;
-    break;
-  }
+  unsigned SequenceSize = 0;
+
+  for (auto &MI : RepeatedSequenceLocs[0])
+    SequenceSize += getInstSizeInBytes(MI);
 
+  // call t0, function = 8 bytes.
+  unsigned CallOverhead = 8;
   for (auto &C : RepeatedSequenceLocs)
-    C.setCallInfo(MOCI, CallOverhead);
+    C.setCallInfo(MachineOutlinerDefault, CallOverhead);
 
-  unsigned SequenceSize = 0;
-  for (auto &MI : Candidate)
-    SequenceSize += getInstSizeInBytes(MI);
+  // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
+  unsigned FrameOverhead = 4;
+  if (RepeatedSequenceLocs[0]
+          .getMF()
+          ->getSubtarget<RISCVSubtarget>()
+          .hasStdExtCOrZca())
+    FrameOverhead = 2;
 
   return std::make_unique<outliner::OutlinedFunction>(
-      RepeatedSequenceLocs, SequenceSize, FrameOverhead, MOCI);
+      RepeatedSequenceLocs, SequenceSize, FrameOverhead,
+      MachineOutlinerDefault);
 }
 
 outliner::InstrType
@@ -3069,8 +2995,15 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
     return F.needsUnwindTableEntry() ? outliner::InstrType::Illegal
                                      : outliner::InstrType::Invisible;
 
-  if (cannotInsertTailCall(*MBB) &&
-      (MI.isReturn() || isMIModifiesReg(MI, TRI, RISCV::X5)))
+  // We need support for tail calls to outlined functions before return
+  // statements can be allowed.
+  if (MI.isReturn())
+    return outliner::InstrType::Illegal;
+
+  // Don't allow modifying the X5 register which we use for return addresses for
+  // these outlined functions.
+  if (MI.modifiesRegister(RISCV::X5, TRI) ||
+      MI.getDesc().hasImplicitDefOfPhysReg(RISCV::X5))
     return outliner::InstrType::Illegal;
 
   // Make sure the operands don't reference something unsafe.
@@ -3106,9 +3039,6 @@ void RISCVInstrInfo::buildOutlinedFrame(
     }
   }
 
-  if (OF.FrameConstructionID == MachineOutlinerTailCall)
-    return;
-
   MBB.addLiveIn(RISCV::X5);
 
   // Add in a return instruction to the end of the outlined frame.
@@ -3122,13 +3052,6 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall(
     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
     MachineFunction &MF, outliner::Candidate &C) const {
 
-  if (C.CallConstructionID == MachineOutlinerTailCall) {
-    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(RISCV::PseudoTAIL))
-                            .addGlobalAddress(M.getNamedValue(MF.getName()),
-                                              /*Offset=*/0, RISCVII::MO_CALL));
-    return It;
-  }
-
   // Add in a call instruction to the outlined function at the given location.
   It = MBB.insert(It,
                   BuildMI(MF, DebugLoc(), get(RISCV::PseudoCALLReg), RISCV::X5)
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-call.ll b/llvm/test/CodeGen/RISCV/machine-outliner-call.ll
deleted file mode 100644
index b019cfe74864b0..00000000000000
--- a/llvm/test/CodeGen/RISCV/machine-outliner-call.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -enable-machine-outliner | FileCheck %s
-
-target triple = "riscv64-unknown-linux-gnu"
-
-declare void @foo(i32, i32, i32, i32) minsize
-
-define void @fentry0(i1 %a) nounwind {
-; CHECK-LABEL: fentry0:
-; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB0_2:
-; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-entry:
-  br i1 %a, label %if.then, label %if.end
-if.then:
-  call void @foo(i32 1, i32 2, i32 3, i32 4)
-  br label %if.end
-if.end:
-  call void @foo(i32 5, i32 6, i32 7, i32 8)
-  ret void
-}
-
-define void @fentry1(i1 %a) nounwind {
-; CHECK-LABEL: fentry1:
-; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB1_2:
-; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-entry:
-  br i1 %a, label %if.then, label %if.end
-if.then:
-  call void @foo(i32 1, i32 2, i32 3, i32 4)
-  br label %if.end
-if.end:
-  call void @foo(i32 5, i32 6, i32 7, i32 8)
-  ret void
-}
-
-define void @fentry2(i1 %a) nounwind {
-; CHECK-LABEL: fentry2:
-; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB2_2:
-; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-entry:
-  br i1 %a, label %if.then, label %if.end
-if.then:
-  call void @foo(i32 1, i32 2, i32 3, i32 4)
-  br label %if.end
-if.end:
-  call void @foo(i32 5, i32 6, i32 7, i32 8)
-  ret void
-}
-
-; CHECK:       OUTLINED_FUNCTION_[[BB2]]:
-; CHECK:       li      a0, 5
-; CHECK-NEXT:  li      a1, 6
-; CHECK-NEXT:  li      a2, 7
-; CHECK-NEXT:  li      a3, 8
-; CHECK-NEXT:  call foo
-
-; CHECK:       OUTLINED_FUNCTION_[[BB1]]:
-; CHECK:       li      a0, 1
-; CHECK-NEXT:  li      a1, 2
-; CHECK-NEXT:  li      a2, 3
-; CHECK-NEXT:  li      a3, 4
-; CHECK-NEXT:  jr      t0
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
index 2acb1d43e01eaf..6ecca6a1b18ef8 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
@@ -22,11 +22,13 @@ body:             |
     ; RV32I-MO-LABEL: name: func1
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func1
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
     $x11 = ORI $x11, 1023
@@ -47,11 +49,13 @@ body:             |
     ; RV32I-MO-LABEL: name: func2
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func2
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
     $x11 = ORI $x11, 1023
@@ -72,11 +76,13 @@ body:             |
     ; RV32I-MO-LABEL: name: func3
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func3
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, -12
     $x11 = ORI $x11, 1023
@@ -90,11 +96,11 @@ body:             |
 
 
 # OUTLINED-LABEL: name: OUTLINED_FUNCTION_0
-# OUTLINED: liveins: $x11, $x10
+# OUTLINED: liveins: $x11, $x10, $x5
 # OUTLINED-NEXT: {{  $}}
 # OUTLINED-NEXT: $x10 = ORI $x10, 1023
 # OUTLINED-NEXT: $x11 = ORI $x11, 1023
 # OUTLINED-NEXT: $x12 = ADDI $x10, 17
 # OUTLINED-NEXT: $x11 = AND $x12, $x11
 # OUTLINED-NEXT: $x10 = SUB $x10, $x11
-# OUTLINED-NEXT: PseudoRET
+# OUTLINED-NEXT: $x0 = JALR $x5, 0
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll b/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
index 0441361b117989..8fab0aa9b6a76c 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
@@ -94,8 +94,7 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
 ; CHECK-BASELINE-NEXT:	li	a5, 0x6
-; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
-; CHECK-BASELINE-NEXT:	jr	t1
+; CHECK-BASELINE-NEXT:	jr	t0
 
 ; CHECK-BASELINE: <OUTLINED_FUNCTION_1>:
 ; CHECK-BASELINE-NEXT:	li	a0, 0x1
@@ -103,9 +102,8 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a2, 0x3
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
-; CHECK-BASELINE-NEXT:	li	a5, 0x8
-; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
-; CHECK-BASELINE-NEXT:	jr	t1
+; CHECK-BASELINE-NEXT:	li	a5, 0x7
+; CHECK-BASELINE-NEXT:	jr	t0
 
 ; CHECK-BASELINE: <OUTLINED_FUNCTION_2>:
 ; CHECK-BASELINE-NEXT:	li	a0, 0x1
@@ -113,9 +111,8 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a2, 0x3
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
-; CHECK-BASELINE-NEXT:	li	a5, 0x7
-; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
-; CHECK-BASELINE-NEXT:	jr	t1
+; CHECK-BASELINE-NEXT:	li	a5, 0x8
+; CHECK-BASELINE-NEXT:	jr	t0
 
 ; CHECK-LEAF-DESCENDANTS: <OUTLINED_FUNCTION_0>:
 ; CHECK-LEAF-DESCENDANTS-NEXT:	li	a0, 0x1
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll b/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
index 4a54a7289ddf27..4ef3abd241577f 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
@@ -11,11 +11,7 @@ define void @fentry0(i1 %a) nounwind "fentry-call"="true" {
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    # FEntry call
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB0_2:
-; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-; CHECK-NEXT:    call	foo
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -31,11 +27,7 @@ define void @fentry1(i1 %a) nounwind "fentry-call"="true" {
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    # FEntry call
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB1_2:
-; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-; CHECK-NEXT:    call	foo
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -55,11 +47,7 @@ define void @patchable0(i1 %a) nounwind "patchable-function-entry"="2" {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB2_2:
-; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-; CHECK-NEXT:    call	foo
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -77,11 +65,7 @@ define void @patchable1(i1 %a) nounwind "patchable-function-entry"="2" {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
-; CHECK-NEXT:    call foo
-; CHECK-LABEL: .LBB3_2:
-; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
-; CHECK-NEXT:    call	foo
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-position.mir b/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
index 47ec447f61d09c..715e212eecabb3 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
@@ -25,14 +25,15 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: EH_LABEL <mcsymbol .Ltmp0>
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
-    ;
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func1
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: EH_LABEL <mcsymbol .Ltmp0>
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     EH_LABEL <mcsymbol .Ltmp0>
     $x11 = ORI $x11, 1023
@@ -52,14 +53,15 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: GC_LABEL <mcsymbol .Ltmp1>
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
-    ;
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func2
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: GC_LABEL <mcsymbol .Ltmp1>
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     GC_LABEL <mcsymbol .Ltmp1>
     $x11 = ORI $x11, 1023
@@ -79,14 +81,15 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: ANNOTATION_LABEL <mcsymbol .Ltmp2>
-    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
-    ;
+    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV32I-MO-NEXT: PseudoRET
     ; RV64I-MO-LABEL: name: func3
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: ANNOTATION_LABEL <mcsymbol .Ltmp2>
-    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
+    ; RV64I-MO-NEXT: PseudoRET
     $x10 = ORI $x10, 1023
     ANNOTATION_LABEL <mcsymbol .Ltmp2>
     $x11 = ORI $x11, 1023
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir b/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir
deleted file mode 100644
index 14a9cd7f0dcffa..00000000000000
--- a/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir
+++ /dev/null
@@ -1,54 +0,0 @@
-# Check that modifying X5 register is not a problem for machine outliner
-
-# RUN: llc -mtriple=riscv32 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
-# RUN: | FileCheck -check-prefixes=CHECK,RV32I-MO %s
-# RUN: llc -mtriple=riscv64 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
-# RUN: | FileCheck -check-prefixes=CHECK,RV64I-MO %s
-
---- |
-  define i32 @outline_tail_1(i32 %a, i32 %b) { ret i32 0 }
-
-  define i32 @outline_tail_2(i32 %a, i32 %b) { ret i32 0 }
-...
----
-name:            outline_tail_1
-tracksRegLiveness: true
-isOutlined: false
-body:             |
-  bb.0:
-    liveins: $x10, $x11, $x5
-    ; RV32I-MO-LABEL: name: outline_tail_1
-    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
-    ;
-    ; RV64I-MO-LABEL: name: outline_tail_1
-    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
-    $x11 = ORI $x11, 1023
-    $x12 = ADDI $x10, 17
-    $x10 = ADD $x10, $x5
-    $x11 = AND $x12, $x11
-    $x10 = SUB $x10, $x11
-    PseudoRET implicit $x10
-...
----
-name:            outline_tail_2
-tracksRegLiveness: true
-isOutlined: false
-body:             |
-  bb.0:
-    liveins: $x10, $x11, $x5
-    ; RV32I-MO-LABEL: name: outline_tail_2
-    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
-    ;
-    ; RV64I-MO-LABEL: name: outline_tail_2
-    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
-    $x11 = ORI $x11, 1023
-    $x12 = ADDI $x10, 17
-    $x10 = ADD $x10, $x5
-    $x11 = AND $x12, $x11
-    $x10 = SUB $x10, $x11
-    PseudoRET implicit $x10
-...
-
-
-# CHECK-LABEL: name: OUTLINED_FUNCTION_0
-# CHECK: isOutlined: true
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner.mir b/llvm/test/CodeGen/RISCV/machineoutliner.mir
index ab12bfbe1fafc4..0221257354fcfa 100644
--- a/llvm/test/CodeGen/RISCV/machineoutliner.mir
+++ b/llvm/test/CodeGen/RISCV/machineoutliner.mir
@@ -29,10 +29,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_0
-    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     ;
     ; RV64I-MO-LABEL: name: outline_0
-    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -48,10 +48,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_1
-    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     ;
     ; RV64I-MO-LABEL: name: outline_1
-    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -67,10 +67,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_2
-    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     ;
     ; RV64I-MO-LABEL: name: outline_2
-    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -87,11 +87,9 @@ body:             |
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: dont_outline_0
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_0
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -108,11 +106,9 @@ body:             |
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: dont_outline_1
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_1
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -129,11 +125,9 @@ body:             |
     liveins: $x10, $x11, $x5
     ; RV32I-MO-LABEL: name: dont_outline_2
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_2
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
-    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11

From 6f5e5b630559f2d17bdccfab5dff3a97ac0f8c66 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Tue, 26 Nov 2024 13:01:02 +0000
Subject: [PATCH 06/41] [mlir][unittest][nfc] Simplify `getInversePermutation`
 (#117698)

---
 mlir/unittests/IR/AffineMapTest.cpp | 34 ++++++++++-------------------
 1 file changed, 12 insertions(+), 22 deletions(-)

diff --git a/mlir/unittests/IR/AffineMapTest.cpp b/mlir/unittests/IR/AffineMapTest.cpp
index 166692f731d1cf..ff1f28235d4093 100644
--- a/mlir/unittests/IR/AffineMapTest.cpp
+++ b/mlir/unittests/IR/AffineMapTest.cpp
@@ -97,17 +97,12 @@ TEST(AffineMapTest, getInversePermutation) {
   auto resultsInv1 = inverseMap1.getResults();
   EXPECT_EQ(resultsInv1.size(), 3UL);
 
-  // 1.1 Expect d2
-  AffineDimExpr expr = llvm::dyn_cast<AffineDimExpr>(resultsInv1[0]);
-  EXPECT_TRUE(expr && expr.getPosition() == 2);
-
-  // 1.2 Expect d0
-  expr = llvm::dyn_cast<AffineDimExpr>(resultsInv1[1]);
-  EXPECT_TRUE(expr && expr.getPosition() == 0);
-
-  // 1.3 Expect d3
-  expr = llvm::dyn_cast<AffineDimExpr>(resultsInv1[2]);
-  EXPECT_TRUE(expr && expr.getPosition() == 3);
+  // Expect (d2, d0, d3)
+  SmallVector<unsigned> expected = {2, 0, 3};
+  for (auto [idx, res] : llvm::enumerate(resultsInv1)) {
+    AffineDimExpr expr = llvm::dyn_cast<AffineDimExpr>(res);
+    EXPECT_TRUE(expr && expr.getPosition() == expected[idx]);
+  }
 
   // 2.   (d0, d1, d2) -> (d1, d0 + d1, d0, d2, d1, d2, d1, d0)
   auto sum = d0 + d1;
@@ -118,15 +113,10 @@ TEST(AffineMapTest, getInversePermutation) {
   auto resultsInv2 = inverseMap2.getResults();
   EXPECT_EQ(resultsInv2.size(), 3UL);
 
-  // 2.1 Expect d2
-  expr = llvm::dyn_cast<AffineDimExpr>(resultsInv2[0]);
-  EXPECT_TRUE(expr && expr.getPosition() == 2);
-
-  // 2.2 Expect d0
-  expr = llvm::dyn_cast<AffineDimExpr>(resultsInv2[1]);
-  EXPECT_TRUE(expr && expr.getPosition() == 0);
-
-  // 2.3 Expect d3
-  expr = llvm::dyn_cast<AffineDimExpr>(resultsInv2[2]);
-  EXPECT_TRUE(expr && expr.getPosition() == 3);
+  // Expect (d2, d0, d3)
+  expected = {2, 0, 3};
+  for (auto [idx, res] : llvm::enumerate(resultsInv2)) {
+    AffineDimExpr expr = llvm::dyn_cast<AffineDimExpr>(res);
+    EXPECT_TRUE(expr && expr.getPosition() == expected[idx]);
+  }
 }

From 5322415f92fe44a9dac29c95da5ed434efbbba7e Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 26 Nov 2024 14:34:30 +0100
Subject: [PATCH 07/41] [PowerPC] Use getSignedConstant() in
 SelectOptimalAddrMode()

All of these immediates are signed, as the surrounding comments
indicate. This fixes an assertion failure in
CodeGen/Generic/dag-combine-ossfuzz-crash.ll when run with a
powerpc-aix triple.
---
 llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index f4d3668726164b..5451035b500846 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18928,7 +18928,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
       SDValue Op1 = N.getOperand(1);
       int16_t Imm = Op1->getAsZExtVal();
       if (!Align || isAligned(*Align, Imm)) {
-        Disp = DAG.getTargetConstant(Imm, DL, N.getValueType());
+        Disp = DAG.getSignedTargetConstant(Imm, DL, N.getValueType());
         Base = Op0;
         if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op0)) {
           Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -18959,7 +18959,7 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
       // this as "d, 0".
       int16_t Imm;
       if (isIntS16Immediate(CN, Imm) && (!Align || isAligned(*Align, Imm))) {
-        Disp = DAG.getTargetConstant(Imm, DL, CNType);
+        Disp = DAG.getSignedTargetConstant(Imm, DL, CNType);
         Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
                                CNType);
         break;
@@ -18992,14 +18992,14 @@ PPC::AddrMode PPCTargetLowering::SelectOptimalAddrMode(const SDNode *Parent,
     if (((Opcode == ISD::ADD) || (Opcode == ISD::OR)) &&
         (isIntS34Immediate(N.getOperand(1), Imm34))) {
       // N is an Add/OR Node, and it's operand is a 34-bit signed immediate.
-      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
+      Disp = DAG.getSignedTargetConstant(Imm34, DL, N.getValueType());
       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0)))
         Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
       else
         Base = N.getOperand(0);
     } else if (isIntS34Immediate(N, Imm34)) {
       // The address is a 34-bit signed immediate.
-      Disp = DAG.getTargetConstant(Imm34, DL, N.getValueType());
+      Disp = DAG.getSignedTargetConstant(Imm34, DL, N.getValueType());
       Base = DAG.getRegister(PPC::ZERO8, N.getValueType());
     }
     break;

From eb5d69c9ab5e817aaff967d06c8f358c0844b6b8 Mon Sep 17 00:00:00 2001
From: "Oleksandr T." <oleksandr.tarasiuk@outlook.com>
Date: Tue, 26 Nov 2024 15:41:18 +0200
Subject: [PATCH 08/41] [Clang] use begin member expr location for call expr
 with deducing this (#117345)

Fixes #116928
---
 clang/docs/ReleaseNotes.rst                     |  2 ++
 clang/lib/Sema/SemaOverload.cpp                 |  2 +-
 clang/test/AST/ast-dump-cxx2b-deducing-this.cpp | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)
 create mode 100644 clang/test/AST/ast-dump-cxx2b-deducing-this.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 6c2777f01c4808..4f4b2ce03cd5c3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -732,6 +732,8 @@ Bug Fixes to AST Handling
   sometimes incorrectly return null even if a comment was present. (#GH108145)
 - Clang now correctly parses the argument of the ``relates``, ``related``, ``relatesalso``,
   and ``relatedalso`` comment commands.
+- Clang now uses the location of the begin of the member expression for ``CallExpr``
+  involving deduced ``this``. (#GH116928)
 
 Miscellaneous Bug Fixes
 ^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp
index e4bf9aa521224b..4c9e37bd286dee 100644
--- a/clang/lib/Sema/SemaOverload.cpp
+++ b/clang/lib/Sema/SemaOverload.cpp
@@ -15565,7 +15565,7 @@ ExprResult Sema::BuildCallToMemberFunction(Scope *S, Expr *MemExprE,
     // Build the actual expression node.
     ExprResult FnExpr =
         CreateFunctionRefExpr(*this, Method, FoundDecl, MemExpr,
-                              HadMultipleCandidates, MemExpr->getExprLoc());
+                              HadMultipleCandidates, MemExpr->getBeginLoc());
     if (FnExpr.isInvalid())
       return ExprError();
 
diff --git a/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp b/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp
new file mode 100644
index 00000000000000..04cff07376885a
--- /dev/null
+++ b/clang/test/AST/ast-dump-cxx2b-deducing-this.cpp
@@ -0,0 +1,15 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -std=c++2b -ast-dump %s | FileCheck -strict-whitespace %s
+
+namespace GH116928 {
+struct S {
+  int f(this S&);
+};
+
+int main() {
+  S s;
+  int x = s.f();
+  // CHECK: CallExpr 0x{{[^ ]*}} <col:11, col:15> 'int
+  // CHECK-NEXT: |-ImplicitCastExpr 0x{{[^ ]*}} <col:11> 'int (*)(S &)' <FunctionToPointerDecay>
+  // CHECK-NEXT: | `-DeclRefExpr 0x{{[^ ]*}} <col:11> 'int (S &)' lvalue CXXMethod 0x{{[^ ]*}} 'f' 'int (S &)'
+}
+}

From 65c36179be68dda0d1cc5d7e5c5b312a6b52cc0e Mon Sep 17 00:00:00 2001
From: Congcong Cai <congcongcai0907@163.com>
Date: Tue, 26 Nov 2024 21:50:35 +0800
Subject: [PATCH 09/41] [clang][analysis][NFC]add static for internal linkage
 function (#117481)

Detected by misc-use-internal-linkage
---
 clang/lib/Analysis/FlowSensitive/Arena.cpp       |  4 ++--
 .../FlowSensitive/DataflowEnvironment.cpp        | 14 ++++++++------
 .../FlowSensitive/Models/ChromiumCheckModel.cpp  |  5 +++--
 clang/lib/Analysis/IntervalPartition.cpp         | 16 ++++++++--------
 clang/lib/Analysis/UnsafeBufferUsage.cpp         |  3 ++-
 5 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/clang/lib/Analysis/FlowSensitive/Arena.cpp b/clang/lib/Analysis/FlowSensitive/Arena.cpp
index 81137e8088e330..7542a137c735e0 100644
--- a/clang/lib/Analysis/FlowSensitive/Arena.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Arena.cpp
@@ -23,8 +23,8 @@ canonicalFormulaPair(const Formula &LHS, const Formula &RHS) {
 }
 
 template <class Key, class ComputeFunc>
-const Formula &cached(llvm::DenseMap<Key, const Formula *> &Cache, Key K,
-                      ComputeFunc &&Compute) {
+static const Formula &cached(llvm::DenseMap<Key, const Formula *> &Cache, Key K,
+                             ComputeFunc &&Compute) {
   auto [It, Inserted] = Cache.try_emplace(std::forward<Key>(K));
   if (Inserted)
     It->second = Compute();
diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index c5c6e900b79766..693313b322af1b 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -64,7 +64,8 @@ static llvm::DenseMap<const ValueDecl *, StorageLocation *> intersectDeclToLoc(
 // expression must map to the same location / value. This is the case if we are
 // performing a join for control flow within a full-expression (which is the
 // only case when this function should be used).
-template <typename MapT> MapT joinExprMaps(const MapT &Map1, const MapT &Map2) {
+template <typename MapT>
+static MapT joinExprMaps(const MapT &Map1, const MapT &Map2) {
   MapT Result = Map1;
 
   for (const auto &Entry : Map2) {
@@ -204,10 +205,11 @@ static WidenResult widenDistinctValues(QualType Type, Value &Prev,
 // Returns whether the values in `Map1` and `Map2` compare equal for those
 // keys that `Map1` and `Map2` have in common.
 template <typename Key>
-bool compareKeyToValueMaps(const llvm::MapVector<Key, Value *> &Map1,
-                           const llvm::MapVector<Key, Value *> &Map2,
-                           const Environment &Env1, const Environment &Env2,
-                           Environment::ValueModel &Model) {
+static bool compareKeyToValueMaps(const llvm::MapVector<Key, Value *> &Map1,
+                                  const llvm::MapVector<Key, Value *> &Map2,
+                                  const Environment &Env1,
+                                  const Environment &Env2,
+                                  Environment::ValueModel &Model) {
   for (auto &Entry : Map1) {
     Key K = Entry.first;
     assert(K != nullptr);
@@ -260,7 +262,7 @@ joinLocToVal(const llvm::MapVector<const StorageLocation *, Value *> &LocToVal,
 // Perform widening on either `LocToVal` or `ExprToVal`. `Key` must be either
 // `const StorageLocation *` or `const Expr *`.
 template <typename Key>
-llvm::MapVector<Key, Value *>
+static llvm::MapVector<Key, Value *>
 widenKeyToValueMap(const llvm::MapVector<Key, Value *> &CurMap,
                    const llvm::MapVector<Key, Value *> &PrevMap,
                    Environment &CurEnv, const Environment &PrevEnv,
diff --git a/clang/lib/Analysis/FlowSensitive/Models/ChromiumCheckModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/ChromiumCheckModel.cpp
index 77d817dafe8378..02fd73754f01be 100644
--- a/clang/lib/Analysis/FlowSensitive/Models/ChromiumCheckModel.cpp
+++ b/clang/lib/Analysis/FlowSensitive/Models/ChromiumCheckModel.cpp
@@ -16,8 +16,9 @@ namespace dataflow {
 
 /// Determines whether `D` is one of the methods used to implement Chromium's
 /// `CHECK` macros. Populates `CheckDecls`, if empty.
-bool isCheckLikeMethod(llvm::SmallDenseSet<const CXXMethodDecl *> &CheckDecls,
-                       const CXXMethodDecl &D) {
+static bool
+isCheckLikeMethod(llvm::SmallDenseSet<const CXXMethodDecl *> &CheckDecls,
+                  const CXXMethodDecl &D) {
   // All of the methods of interest are static, so avoid any lookup for
   // non-static methods (the common case).
   if (!D.isStatic())
diff --git a/clang/lib/Analysis/IntervalPartition.cpp b/clang/lib/Analysis/IntervalPartition.cpp
index 5f06606ec132e9..41199f358c5b97 100644
--- a/clang/lib/Analysis/IntervalPartition.cpp
+++ b/clang/lib/Analysis/IntervalPartition.cpp
@@ -36,8 +36,8 @@ static unsigned getID(const CFGIntervalNode &I) { return I.ID; }
 
 // `Node` must be one of `CFGBlock` or `CFGIntervalNode`.
 template <typename Node>
-BuildResult<Node> buildInterval(llvm::BitVector &Partitioned,
-                                const Node *Header) {
+static BuildResult<Node> buildInterval(llvm::BitVector &Partitioned,
+                                       const Node *Header) {
   assert(Header != nullptr);
   BuildResult<Node> Interval;
   Interval.Nodes.push_back(Header);
@@ -102,10 +102,10 @@ BuildResult<Node> buildInterval(llvm::BitVector &Partitioned,
 }
 
 template <typename Node>
-void fillIntervalNode(CFGIntervalGraph &Graph,
-                      std::vector<CFGIntervalNode *> &Index,
-                      std::queue<const Node *> &Successors,
-                      llvm::BitVector &Partitioned, const Node *Header) {
+static void fillIntervalNode(CFGIntervalGraph &Graph,
+                             std::vector<CFGIntervalNode *> &Index,
+                             std::queue<const Node *> &Successors,
+                             llvm::BitVector &Partitioned, const Node *Header) {
   BuildResult<Node> Result = buildInterval(Partitioned, Header);
   for (const auto *S : Result.Successors)
     Successors.push(S);
@@ -138,8 +138,8 @@ void fillIntervalNode(CFGIntervalGraph &Graph,
 }
 
 template <typename Node>
-CFGIntervalGraph partitionIntoIntervalsImpl(unsigned NumBlockIDs,
-                                            const Node *EntryBlock) {
+static CFGIntervalGraph partitionIntoIntervalsImpl(unsigned NumBlockIDs,
+                                                   const Node *EntryBlock) {
   assert(EntryBlock != nullptr);
   CFGIntervalGraph Graph;
   // `Index` maps all of the nodes of the input graph to the interval to which
diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 5f36ffa926b269..321097e16a45f7 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -2326,7 +2326,8 @@ static StringRef getEndOfLine() {
 }
 
 // Returns the text indicating that the user needs to provide input there:
-std::string getUserFillPlaceHolder(StringRef HintTextToUser = "placeholder") {
+static std::string
+getUserFillPlaceHolder(StringRef HintTextToUser = "placeholder") {
   std::string s = std::string("<# ");
   s += HintTextToUser;
   s += " #>";

From 46fcdbbc78717767551594d5d9174db0bac1b375 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 26 Nov 2024 13:55:37 +0000
Subject: [PATCH 10/41] [InstCombine] Add alias.scope & noalias metadata to
 test.

---
 llvm/test/Transforms/InstCombine/loadstore-metadata.ll | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
index 3fdba7cfae67e1..4976eace72a9f9 100644
--- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
+++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll
@@ -285,8 +285,8 @@ define double @preserve_load_metadata_after_select_transform_metadata_missing_4(
 ; CHECK-NEXT:    ret double [[L_SEL]]
 ;
 entry:
-  %l.a = load double, ptr %a, align 8, !tbaa !0, !llvm.access.group !7
-  %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !12
+  %l.a = load double, ptr %a, align 8, !tbaa !0, !llvm.access.group !7, !alias.scope !3, !noalias !3
+  %l.b = load double, ptr %b, align 8, !tbaa !0, !llvm.access.group !12, !alias.scope !14, !noalias !14
   %cmp.i = fcmp fast olt double %l.a, %l.b
   %ptr.sel = select i1 %cmp.i, ptr %b, ptr %a
   %l.sel = load double, ptr %ptr.sel, align 8, !tbaa !0, !llvm.access.group !13
@@ -307,6 +307,10 @@ entry:
 !11 = !{i32 5, i32 6}
 !12 = distinct !{}
 !13 = distinct !{}
+!14 = !{!15}
+!15 = distinct !{!15, !16}
+!16 = distinct !{!16}
+
 ;.
 ; CHECK: [[TBAA0]] = !{[[LOOP1]], [[LOOP1]], i64 0}
 ; CHECK: [[LOOP1]] = !{!"scalar type", [[META2:![0-9]+]]}

From f4379db49683a6b1d3d63b577985312556373c6f Mon Sep 17 00:00:00 2001
From: tangaac <tangyan01@loongson.cn>
Date: Tue, 26 Nov 2024 21:57:29 +0800
Subject: [PATCH 11/41] [LoongArch] Support LA V1.1 feature that div.w[u] and
 mod.w[u] instructions with inputs not signed-extended. (#116764)

Two options for clang
-mdiv32: Use div.w[u] and mod.w[u] instructions with input not
sign-extended.
-mno-div32: Do not use div.w[u] and mod.w[u] instructions with input not
sign-extended.
The default is -mno-div32.
---
 clang/include/clang/Driver/Options.td         |  4 +
 clang/lib/Basic/Targets/LoongArch.cpp         |  8 +-
 clang/lib/Basic/Targets/LoongArch.h           |  2 +
 .../lib/Driver/ToolChains/Arch/LoongArch.cpp  |  9 +++
 clang/test/Driver/loongarch-march.c           |  8 +-
 clang/test/Driver/loongarch-mdiv32.c          | 30 +++++++
 clang/test/Preprocessor/init-loongarch.c      | 27 ++++---
 .../TargetParser/LoongArchTargetParser.def    |  3 +-
 .../llvm/TargetParser/LoongArchTargetParser.h |  2 +
 llvm/lib/Target/LoongArch/LoongArch.td        |  9 ++-
 .../LoongArch/LoongArchISelLowering.cpp       |  5 +-
 llvm/lib/TargetParser/Host.cpp                |  2 +-
 .../TargetParser/LoongArchTargetParser.cpp    |  1 +
 .../sdiv-udiv-srem-urem-div32.ll              | 81 +++++++++++++++++++
 14 files changed, 173 insertions(+), 18 deletions(-)
 create mode 100644 clang/test/Driver/loongarch-mdiv32.c
 create mode 100644 llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem-div32.ll

diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 40fd48761928b3..7e3f6aaaa10dbe 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5417,6 +5417,10 @@ def mld_seq_sa : Flag<["-"], "mld-seq-sa">, Group<m_loongarch_Features_Group>,
   HelpText<"Do not generate load-load barrier instructions (dbar 0x700)">;
 def mno_ld_seq_sa : Flag<["-"], "mno-ld-seq-sa">, Group<m_loongarch_Features_Group>,
   HelpText<"Generate load-load barrier instructions (dbar 0x700)">;
+def mdiv32 : Flag<["-"], "mdiv32">, Group<m_loongarch_Features_Group>,
+  HelpText<"Use div.w[u] and mod.w[u] instructions with input not sign-extended.">;
+def mno_div32 : Flag<["-"], "mno-div32">, Group<m_loongarch_Features_Group>,
+  HelpText<"Do not use div.w[u] and mod.w[u] instructions with input not sign-extended.">;
 def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group<m_loongarch_Features_Group>,
   HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">;
 def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group<m_loongarch_Features_Group>,
diff --git a/clang/lib/Basic/Targets/LoongArch.cpp b/clang/lib/Basic/Targets/LoongArch.cpp
index 3f2d7317532aaf..9ed695358b1ca8 100644
--- a/clang/lib/Basic/Targets/LoongArch.cpp
+++ b/clang/lib/Basic/Targets/LoongArch.cpp
@@ -205,7 +205,8 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
       // TODO: As more features of the V1.1 ISA are supported, a unified "v1.1"
       // arch feature set will be used to include all sub-features belonging to
       // the V1.1 ISA version.
-      if (HasFeatureFrecipe && HasFeatureLAM_BH && HasFeatureLD_SEQ_SA)
+      if (HasFeatureFrecipe && HasFeatureLAM_BH && HasFeatureLD_SEQ_SA &&
+          HasFeatureDiv32)
         Builder.defineMacro("__loongarch_arch",
                             Twine('"') + "la64v1.1" + Twine('"'));
       else
@@ -242,6 +243,9 @@ void LoongArchTargetInfo::getTargetDefines(const LangOptions &Opts,
   if (HasFeatureLD_SEQ_SA)
     Builder.defineMacro("__loongarch_ld_seq_sa", Twine(1));
 
+  if (HasFeatureDiv32)
+    Builder.defineMacro("__loongarch_div32", Twine(1));
+
   StringRef ABI = getABI();
   if (ABI == "lp64d" || ABI == "lp64f" || ABI == "lp64s")
     Builder.defineMacro("__loongarch_lp64");
@@ -322,6 +326,8 @@ bool LoongArchTargetInfo::handleTargetFeatures(
       HasFeatureLAM_BH = true;
     else if (Feature == "+ld-seq-sa")
       HasFeatureLD_SEQ_SA = true;
+    else if (Feature == "+div32")
+      HasFeatureDiv32 = true;
   }
   return true;
 }
diff --git a/clang/lib/Basic/Targets/LoongArch.h b/clang/lib/Basic/Targets/LoongArch.h
index e5eae7a8fcf677..3002a0bbc4491f 100644
--- a/clang/lib/Basic/Targets/LoongArch.h
+++ b/clang/lib/Basic/Targets/LoongArch.h
@@ -32,6 +32,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
   bool HasFeatureFrecipe;
   bool HasFeatureLAM_BH;
   bool HasFeatureLD_SEQ_SA;
+  bool HasFeatureDiv32;
 
 public:
   LoongArchTargetInfo(const llvm::Triple &Triple, const TargetOptions &)
@@ -43,6 +44,7 @@ class LLVM_LIBRARY_VISIBILITY LoongArchTargetInfo : public TargetInfo {
     HasFeatureFrecipe = false;
     HasFeatureLAM_BH = false;
     HasFeatureLD_SEQ_SA = false;
+    HasFeatureDiv32 = false;
     LongDoubleWidth = 128;
     LongDoubleAlign = 128;
     LongDoubleFormat = &llvm::APFloat::IEEEquad();
diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
index 67b71a3ec623e4..5be57e866d85e8 100644
--- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
+++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp
@@ -283,6 +283,15 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
     else
       Features.push_back("-ld-seq-sa");
   }
+
+  // Select div32 feature determined by -m[no-]div32.
+  if (const Arg *A =
+          Args.getLastArg(options::OPT_mdiv32, options::OPT_mno_div32)) {
+    if (A->getOption().matches(options::OPT_mdiv32))
+      Features.push_back("+div32");
+    else
+      Features.push_back("-div32");
+  }
 }
 
 std::string loongarch::postProcessTargetCPUString(const std::string &CPU,
diff --git a/clang/test/Driver/loongarch-march.c b/clang/test/Driver/loongarch-march.c
index c7091336f3bc80..981ae5c5c7dc1c 100644
--- a/clang/test/Driver/loongarch-march.c
+++ b/clang/test/Driver/loongarch-march.c
@@ -39,21 +39,21 @@
 
 // CC1-LA64V1P1: "-target-cpu" "loongarch64"
 // CC1-LA64V1P1-NOT: "-target-feature"
-// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+ld-seq-sa"
+// CC1-LA64V1P1: "-target-feature" "+64bit" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32"
 // CC1-LA64V1P1-NOT: "-target-feature"
 // CC1-LA64V1P1: "-target-abi" "lp64d"
 
 // CC1-LA664: "-target-cpu" "la664"
 // CC1-LA664-NOT: "-target-feature"
-// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+ld-seq-sa"
+// CC1-LA664: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual" "-target-feature" "+frecipe" "-target-feature" "+lam-bh" "-target-feature" "+ld-seq-sa" "-target-feature" "+div32"
 // CC1-LA664-NOT: "-target-feature"
 // CC1-LA664: "-target-abi" "lp64d"
 
 // IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+f,+ual"
 // IR-LA464: attributes #[[#]] ={{.*}}"target-cpu"="la464" {{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"
 // IR-LA64V1P0: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+lsx,+ual"
-// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+frecipe,+lam-bh,+ld-seq-sa,+lsx,+ual"
-// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+f,+frecipe,+lam-bh,+lasx,+ld-seq-sa,+lsx,+ual"
+// IR-LA64V1P1: attributes #[[#]] ={{.*}}"target-cpu"="loongarch64" {{.*}}"target-features"="+64bit,+d,+div32,+frecipe,+lam-bh,+ld-seq-sa,+lsx,+ual"
+// IR-LA664: attributes #[[#]] ={{.*}}"target-cpu"="la664" {{.*}}"target-features"="+64bit,+d,+div32,+f,+frecipe,+lam-bh,+lasx,+ld-seq-sa,+lsx,+ual"
 
 int foo(void) {
   return 3;
diff --git a/clang/test/Driver/loongarch-mdiv32.c b/clang/test/Driver/loongarch-mdiv32.c
new file mode 100644
index 00000000000000..cf774b3818c55d
--- /dev/null
+++ b/clang/test/Driver/loongarch-mdiv32.c
@@ -0,0 +1,30 @@
+/// Test -m[no]div32 options.
+
+// RUN: %clang --target=loongarch64 -mdiv32 -fsyntax-only %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CC1-DIV32
+// RUN: %clang --target=loongarch64 -mno-div32 -fsyntax-only %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CC1-NO-DIV32
+// RUN: %clang --target=loongarch64 -mno-div32 -mdiv32 -fsyntax-only %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CC1-DIV32
+// RUN: %clang --target=loongarch64  -mdiv32 -mno-div32 -fsyntax-only %s -### 2>&1 | \
+// RUN:     FileCheck %s --check-prefix=CC1-NO-DIV32
+
+// RUN: %clang --target=loongarch64 -mdiv32 -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-DIV32
+// RUN: %clang --target=loongarch64 -mno-div32 -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-NO-DIV32
+// RUN: %clang --target=loongarch64 -mno-div32 -mdiv32 -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-DIV32
+// RUN: %clang --target=loongarch64 -mdiv32 -mno-div32 -S -emit-llvm %s -o - | \
+// RUN: FileCheck %s --check-prefix=IR-NO-DIV32
+
+
+// CC1-DIV32: "-target-feature" "+div32"
+// CC1-NO-DIV32: "-target-feature" "-div32"
+
+// IR-DIV32: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+div32{{(,.*)?}}"
+// IR-NO-DIV32: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-div32{{(,.*)?}}"
+
+int foo(void) {
+  return 42;
+}
diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c
index 0e3320f01b328c..9045073cbb7893 100644
--- a/clang/test/Preprocessor/init-loongarch.c
+++ b/clang/test/Preprocessor/init-loongarch.c
@@ -798,7 +798,7 @@
 // LA64-FPU0-LP64S-NOT: #define __loongarch_single_float
 // LA64-FPU0-LP64S: #define __loongarch_soft_float 1
 
-/// Check __loongarch_arch{_tune/_frecipe/_lam_bh/_ld_seq_sa}.
+/// Check __loongarch_arch{_tune/_frecipe/_lam_bh/_ld_seq_sa/_div32}.
 
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
@@ -823,11 +823,11 @@
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 | \
-// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA -DARCH=la64v1.1 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines  --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=la64v1.1 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -frecipe | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lsx | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +frecipe | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +frecipe | \
@@ -835,7 +835,7 @@
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +lam-bh | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -lam-bh | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LD-SEQ-SA,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lam-bh | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LAM-BH -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +lam-bh | \
@@ -843,23 +843,32 @@
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +ld-seq-sa | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -ld-seq-sa | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +ld-seq-sa | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LD-SEQ-SA -DARCH=loongarch64 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +ld-seq-sa | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s
-// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe -Xclang -target-feature -Xclang +lam-bh  -Xclang -target-feature -Xclang +ld-seq-sa | \
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +div32 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.1 -Xclang -target-feature -Xclang -div32| \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +div32 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=loongarch64 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -Xclang -target-feature -Xclang +lsx -Xclang -target-feature -Xclang +div32 | \
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,DIV32 -DARCH=la64v1.0 -DTUNE=loongarch64 %s
+// RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la64v1.0 -Xclang -target-feature -Xclang +frecipe -Xclang -target-feature -Xclang +lam-bh  -Xclang -target-feature -Xclang +ld-seq-sa -Xclang -target-feature -Xclang +div32 | \
 // RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE -DARCH=la64v1.1 -DTUNE=loongarch64 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA -DARCH=la664 -DTUNE=la664 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=la664 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -mtune=la664 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=la64v1.0 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=loongarch64 -mtune=la664 | \
 // RUN:   FileCheck --match-full-lines --check-prefix=ARCH-TUNE -DARCH=loongarch64 -DTUNE=la664 %s
 // RUN: %clang --target=loongarch64 -x c -E -dM %s -o - -march=la664 -mtune=loongarch64 | \
-// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA -DARCH=la664 -DTUNE=loongarch64 %s
+// RUN:   FileCheck --match-full-lines --check-prefixes=ARCH-TUNE,FRECIPE,LAM-BH,LD-SEQ-SA,DIV32 -DARCH=la664 -DTUNE=loongarch64 %s
 
 // ARCH-TUNE: #define __loongarch_arch "[[ARCH]]"
+// DIV32: #define __loongarch_div32 1
 // FRECIPE: #define __loongarch_frecipe 1
 // LAM-BH: #define __loongarch_lam_bh 1
 // LD-SEQ-SA: #define __loongarch_ld_seq_sa 1
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
index 324d5c18e6dea3..e3285f89ef9eab 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.def
@@ -13,6 +13,7 @@ LOONGARCH_FEATURE("+ual", FK_UAL)
 LOONGARCH_FEATURE("+frecipe", FK_FRECIPE)
 LOONGARCH_FEATURE("+lam-bh", FK_LAM_BH)
 LOONGARCH_FEATURE("+ld-seq-sa", FK_LD_SEQ_SA)
+LOONGARCH_FEATURE("+div32", FK_DIV32)
 
 #undef LOONGARCH_FEATURE
 
@@ -22,6 +23,6 @@ LOONGARCH_FEATURE("+ld-seq-sa", FK_LD_SEQ_SA)
 
 LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
 LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)
-LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE | FK_LAM_BH | FK_LD_SEQ_SA)
+LOONGARCH_ARCH("la664", AK_LA664, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL | FK_FRECIPE | FK_LAM_BH | FK_LD_SEQ_SA | FK_DIV32)
 
 #undef LOONGARCH_ARCH
diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
index 00957b84ab576c..5862becc92d774 100644
--- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
+++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h
@@ -57,6 +57,8 @@ enum FeatureKind : uint32_t {
   // Do not generate load-load barrier instructions (dbar 0x700).
   FK_LD_SEQ_SA = 1 << 12,
 
+  // Assume div.w[u] and mod.w[u] can handle inputs that are not sign-extended.
+  FK_DIV32 = 1 << 13,
 };
 
 struct FeatureInfo {
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 100bdba36c440c..463786e72bdf8d 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -123,6 +123,12 @@ def FeatureLD_SEQ_SA
                         "Don't use load-load barrier (dbar 0x700).">;
 def HasLD_SEQ_SA : Predicate<"Subtarget->hasLD_SEQ_SA()">;
 
+// Assume div.w[u] and mod.w[u] can handle inputs that are not sign-extended.
+def FeatureDiv32
+    : SubtargetFeature<"div32", "HasDiv32", "true",
+                        "Assume div.w[u] and mod.w[u] can handle inputs that are not sign-extended">;
+def HasDiv32 : Predicate<"Subtarget->hasDiv32()">;
+
 def TunePreferWInst
     : SubtargetFeature<"prefer-w-inst", "PreferWInst", "true",
                        "Prefer instructions with W suffix">;
@@ -165,7 +171,8 @@ def : ProcessorModel<"la664", NoSchedModel, [Feature64Bit,
                                              FeatureExtLVZ,
                                              FeatureExtLBT,
                                              FeatureFrecipe,
-                                             FeatureLAM_BH]>;
+                                             FeatureLAM_BH,
+                                             FeatureDiv32]>;
 
 //===----------------------------------------------------------------------===//
 // Define the LoongArch target.
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 1abb428175eea7..6c30cc0d6cfb3e 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -2833,7 +2833,10 @@ void LoongArchTargetLowering::ReplaceNodeResults(
   case ISD::UREM:
     assert(VT == MVT::i32 && Subtarget.is64Bit() &&
            "Unexpected custom legalisation");
-    Results.push_back(customLegalizeToWOp(N, DAG, 2, ISD::SIGN_EXTEND));
+    Results.push_back(customLegalizeToWOp(N, DAG, 2,
+                                          Subtarget.hasDiv32() && VT == MVT::i32
+                                              ? ISD::ANY_EXTEND
+                                              : ISD::SIGN_EXTEND));
     break;
   case ISD::SHL:
   case ISD::SRA:
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 51d6b7cb9b1fd6..05d97e03a67c9c 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -2025,12 +2025,12 @@ const StringMap<bool> sys::getHostCPUFeatures() {
   Features["lvz"] = hwcap & (1UL << 9);  // HWCAP_LOONGARCH_LVZ
 
   Features["frecipe"] = cpucfg2 & (1U << 25); // CPUCFG.2.FRECIPE
+  Features["div32"] = cpucfg2 & (1U << 26);   // CPUCFG.2.DIV32
   Features["lam-bh"] = cpucfg2 & (1U << 27);  // CPUCFG.2.LAM_BH
 
   Features["ld-seq-sa"] = cpucfg3 & (1U << 23); // CPUCFG.3.LD_SEQ_SA
 
   // TODO: Need to complete.
-  // Features["div32"] = cpucfg2 & (1U << 26);       // CPUCFG.2.DIV32
   // Features["lamcas"] = cpucfg2 & (1U << 28);      // CPUCFG.2.LAMCAS
   // Features["llacq-screl"] = cpucfg2 & (1U << 29); // CPUCFG.2.LLACQ_SCREL
   // Features["scq"] = cpucfg2 & (1U << 30);         // CPUCFG.2.SCQ
diff --git a/llvm/lib/TargetParser/LoongArchTargetParser.cpp b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
index 9b8407a73bea3f..8e7681d526cef5 100644
--- a/llvm/lib/TargetParser/LoongArchTargetParser.cpp
+++ b/llvm/lib/TargetParser/LoongArchTargetParser.cpp
@@ -54,6 +54,7 @@ bool LoongArch::getArchFeatures(StringRef Arch,
       Features.push_back("+frecipe");
       Features.push_back("+lam-bh");
       Features.push_back("+ld-seq-sa");
+      Features.push_back("+div32");
     }
     return true;
   }
diff --git a/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem-div32.ll b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem-div32.ll
new file mode 100644
index 00000000000000..b7745ff5e0c42f
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/ir-instruction/sdiv-udiv-srem-urem-div32.ll
@@ -0,0 +1,81 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 -mattr=+d,-div32 < %s | FileCheck %s --check-prefix=LA64
+; RUN: llc --mtriple=loongarch64 -mattr=+d,+div32 < %s | FileCheck %s --check-prefix=LA64-DIV32
+
+; TODO: Use div.w/mod.w for sdiv/srem i32
+
+define i32 @divw(i64 %a, i64 %b) {
+; LA64-LABEL: divw:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    div.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-DIV32-LABEL: divw:
+; LA64-DIV32:       # %bb.0:
+; LA64-DIV32-NEXT:    addi.w $a0, $a0, 0
+; LA64-DIV32-NEXT:    addi.w $a1, $a1, 0
+; LA64-DIV32-NEXT:    div.d $a0, $a0, $a1
+; LA64-DIV32-NEXT:    ret
+  %conv1 = trunc i64 %a to i32
+  %conv2 = trunc i64 %b to i32
+  %r = sdiv i32 %conv1, %conv2
+  ret i32 %r
+}
+
+define i32  @divwu(i64 %a, i64 %b) {
+; LA64-LABEL: divwu:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    div.wu $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-DIV32-LABEL: divwu:
+; LA64-DIV32:       # %bb.0:
+; LA64-DIV32-NEXT:    div.wu $a0, $a0, $a1
+; LA64-DIV32-NEXT:    ret
+  %conv1 = trunc i64 %a to i32
+  %conv2 = trunc i64 %b to i32
+  %r = udiv i32 %conv1, %conv2
+  ret i32 %r
+}
+
+define i32 @modw(i64 %a, i64 %b) {
+; LA64-LABEL: modw:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    mod.d $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-DIV32-LABEL: modw:
+; LA64-DIV32:       # %bb.0:
+; LA64-DIV32-NEXT:    addi.w $a0, $a0, 0
+; LA64-DIV32-NEXT:    addi.w $a1, $a1, 0
+; LA64-DIV32-NEXT:    mod.d $a0, $a0, $a1
+; LA64-DIV32-NEXT:    ret
+  %conv1 = trunc i64 %a to i32
+  %conv2 = trunc i64 %b to i32
+  %r = srem i32 %conv1, %conv2
+  ret i32 %r
+}
+
+define i32 @modwu(i64 %a, i64 %b) {
+; LA64-LABEL: modwu:
+; LA64:       # %bb.0:
+; LA64-NEXT:    addi.w $a1, $a1, 0
+; LA64-NEXT:    addi.w $a0, $a0, 0
+; LA64-NEXT:    mod.wu $a0, $a0, $a1
+; LA64-NEXT:    ret
+;
+; LA64-DIV32-LABEL: modwu:
+; LA64-DIV32:       # %bb.0:
+; LA64-DIV32-NEXT:    mod.wu $a0, $a0, $a1
+; LA64-DIV32-NEXT:    ret
+  %conv1 = trunc i64 %a to i32
+  %conv2 = trunc i64 %b to i32
+  %r = urem i32 %conv1, %conv2
+  ret i32 %r
+}

From 537343dea4e65ddb837473c9349884e856664ad8 Mon Sep 17 00:00:00 2001
From: Hans Wennborg <hans@chromium.org>
Date: Tue, 26 Nov 2024 14:53:44 +0100
Subject: [PATCH 12/41] Revert "[BOLT] DataAggregator support for binaries with
 multiple text segments (#92815)"

This caused test failures, see comment on the PR:

  Failed Tests (2):
    BOLT-Unit :: Core/./CoreTests/AArch64/MemoryMapsTester/MultipleSegmentsMismatchedBaseAddress/0
    BOLT-Unit :: Core/./CoreTests/X86/MemoryMapsTester/MultipleSegmentsMismatchedBaseAddress/0

> When a binary has multiple text segments, the Size is computed as the
> difference of the last address of these segments from the BaseAddress.
> The base addresses of all text segments must be the same.
>
> Introduces flag 'perf-script-events' for testing. It allows passing perf events
> without BOLT having to parse them using 'perf script'. The flag is used to
> pass a mock perf profile that has two memory mappings for a mock binary
> that has two text segments. The size of the mapping is updated as this
> change `parseMMapEvents` processes all text segments.

This reverts commit 4b71b3782d217db0138b701c4514bd2168ca1659.
---
 bolt/include/bolt/Profile/DataAggregator.h    |   8 -
 bolt/lib/Profile/DataAggregator.cpp           |  43 ++----
 bolt/unittests/Core/CMakeLists.txt            |   3 -
 bolt/unittests/Core/MemoryMaps.cpp            | 142 ------------------
 .../gn/secondary/bolt/unittests/Core/BUILD.gn |   1 -
 5 files changed, 14 insertions(+), 183 deletions(-)
 delete mode 100644 bolt/unittests/Core/MemoryMaps.cpp

diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 320623cfa15af1..2880bfd03be789 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -170,9 +170,6 @@ class DataAggregator : public DataReader {
   std::string BuildIDBinaryName;
 
   /// Memory map info for a single file as recorded in perf.data
-  /// When a binary has multiple text segments, the Size is computed as the
-  /// difference of the last address of these segments from the BaseAddress.
-  /// The base addresses of all text segments must be the same.
   struct MMapInfo {
     uint64_t BaseAddress{0}; /// Base address of the mapped binary.
     uint64_t MMapAddress{0}; /// Address of the executable segment.
@@ -496,11 +493,6 @@ class DataAggregator : public DataReader {
   /// and return a file name matching a given \p FileBuildID.
   std::optional<StringRef> getFileNameForBuildID(StringRef FileBuildID);
 
-  /// Get a constant reference to the parsed binary mmap entries.
-  const std::unordered_map<uint64_t, MMapInfo> &getBinaryMMapInfo() {
-    return BinaryMMapInfo;
-  }
-
   friend class YAMLProfileWriter;
 };
 } // namespace bolt
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 2b02086e3e0c99..697cac9fbcaa08 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -95,12 +95,6 @@ cl::opt<bool> ReadPreAggregated(
     "pa", cl::desc("skip perf and read data from a pre-aggregated file format"),
     cl::cat(AggregatorCategory));
 
-cl::opt<std::string>
-    ReadPerfEvents("perf-script-events",
-                   cl::desc("skip perf event collection by supplying a "
-                            "perf-script output in a textual format"),
-                   cl::ReallyHidden, cl::init(""), cl::cat(AggregatorCategory));
-
 static cl::opt<bool>
 TimeAggregator("time-aggr",
   cl::desc("time BOLT aggregator"),
@@ -173,9 +167,8 @@ void DataAggregator::findPerfExecutable() {
 void DataAggregator::start() {
   outs() << "PERF2BOLT: Starting data aggregation job for " << Filename << "\n";
 
-  // Don't launch perf for pre-aggregated files or when perf input is specified
-  // by the user.
-  if (opts::ReadPreAggregated || !opts::ReadPerfEvents.empty())
+  // Don't launch perf for pre-aggregated files
+  if (opts::ReadPreAggregated)
     return;
 
   findPerfExecutable();
@@ -471,13 +464,6 @@ void DataAggregator::filterBinaryMMapInfo() {
 
 int DataAggregator::prepareToParse(StringRef Name, PerfProcessInfo &Process,
                                    PerfProcessErrorCallbackTy Callback) {
-  if (!opts::ReadPerfEvents.empty()) {
-    outs() << "PERF2BOLT: using pre-processed perf events for '" << Name
-           << "' (perf-script-events)\n";
-    ParsingBuf = opts::ReadPerfEvents;
-    return 0;
-  }
-
   std::string Error;
   outs() << "PERF2BOLT: waiting for perf " << Name
          << " collection to finish...\n";
@@ -2070,6 +2056,15 @@ std::error_code DataAggregator::parseMMapEvents() {
     if (FileMMapInfo.first == "(deleted)")
       continue;
 
+    // Consider only the first mapping of the file for any given PID
+    auto Range = GlobalMMapInfo.equal_range(FileMMapInfo.first);
+    bool PIDExists = llvm::any_of(make_range(Range), [&](const auto &MI) {
+      return MI.second.PID == FileMMapInfo.second.PID;
+    });
+
+    if (PIDExists)
+      continue;
+
     GlobalMMapInfo.insert(FileMMapInfo);
   }
 
@@ -2121,22 +2116,12 @@ std::error_code DataAggregator::parseMMapEvents() {
                << " using file offset 0x" << Twine::utohexstr(MMapInfo.Offset)
                << ". Ignoring profile data for this mapping\n";
         continue;
+      } else {
+        MMapInfo.BaseAddress = *BaseAddress;
       }
-      MMapInfo.BaseAddress = *BaseAddress;
     }
 
-    // Try to add MMapInfo to the map and update its size. Large binaries may
-    // span to multiple text segments, so the mapping is inserted only on the
-    // first occurrence.
-    if (!BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo)).second)
-      assert(MMapInfo.BaseAddress == BinaryMMapInfo[MMapInfo.PID].BaseAddress &&
-             "Base address on multiple segment mappings should match");
-
-    // Update mapping size.
-    const uint64_t EndAddress = MMapInfo.MMapAddress + MMapInfo.Size;
-    const uint64_t Size = EndAddress - BinaryMMapInfo[MMapInfo.PID].BaseAddress;
-    if (Size > BinaryMMapInfo[MMapInfo.PID].Size)
-      BinaryMMapInfo[MMapInfo.PID].Size = Size;
+    BinaryMMapInfo.insert(std::make_pair(MMapInfo.PID, MMapInfo));
   }
 
   if (BinaryMMapInfo.empty()) {
diff --git a/bolt/unittests/Core/CMakeLists.txt b/bolt/unittests/Core/CMakeLists.txt
index 208cf6ced73585..bad7108dad0b7b 100644
--- a/bolt/unittests/Core/CMakeLists.txt
+++ b/bolt/unittests/Core/CMakeLists.txt
@@ -8,7 +8,6 @@ set(LLVM_LINK_COMPONENTS
 add_bolt_unittest(CoreTests
   BinaryContext.cpp
   MCPlusBuilder.cpp
-  MemoryMaps.cpp
   DynoStats.cpp
 
   DISABLE_LLVM_LINK_LLVM_DYLIB
@@ -18,8 +17,6 @@ target_link_libraries(CoreTests
   PRIVATE
   LLVMBOLTCore
   LLVMBOLTRewrite
-  LLVMBOLTProfile
-  LLVMTestingSupport
   )
 
 foreach (tgt ${BOLT_TARGETS_TO_BUILD})
diff --git a/bolt/unittests/Core/MemoryMaps.cpp b/bolt/unittests/Core/MemoryMaps.cpp
deleted file mode 100644
index 9b5769d051cb6f..00000000000000
--- a/bolt/unittests/Core/MemoryMaps.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//===- bolt/unittest/Core/MemoryMaps.cpp ----------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "bolt/Core/BinaryContext.h"
-#include "bolt/Profile/DataAggregator.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Testing/Support/Error.h"
-#include "gtest/gtest.h"
-
-using namespace llvm;
-using namespace llvm::object;
-using namespace llvm::ELF;
-using namespace bolt;
-
-namespace opts {
-extern cl::opt<std::string> ReadPerfEvents;
-} // namespace opts
-
-namespace {
-
-/// Perform checks on memory map events normally captured in perf. Tests use
-/// the 'opts::ReadPerfEvents' flag to emulate these events, passing a custom
-/// 'perf script' output to DataAggregator.
-struct MemoryMapsTester : public testing::TestWithParam<Triple::ArchType> {
-  void SetUp() override {
-    initalizeLLVM();
-    prepareElf();
-    initializeBOLT();
-  }
-
-protected:
-  void initalizeLLVM() {
-    llvm::InitializeAllTargetInfos();
-    llvm::InitializeAllTargetMCs();
-    llvm::InitializeAllAsmParsers();
-    llvm::InitializeAllDisassemblers();
-    llvm::InitializeAllTargets();
-    llvm::InitializeAllAsmPrinters();
-  }
-
-  void prepareElf() {
-    memcpy(ElfBuf, "\177ELF", 4);
-    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
-    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
-    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
-    EHdr->e_machine = GetParam() == Triple::aarch64 ? EM_AARCH64 : EM_X86_64;
-    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
-    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
-  }
-
-  void initializeBOLT() {
-    Relocation::Arch = ObjFile->makeTriple().getArch();
-    BC = cantFail(BinaryContext::createBinaryContext(
-        ObjFile->makeTriple(), ObjFile->getFileName(), nullptr, true,
-        DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
-    ASSERT_FALSE(!BC);
-  }
-
-  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
-  std::unique_ptr<ObjectFile> ObjFile;
-  std::unique_ptr<BinaryContext> BC;
-};
-} // namespace
-
-#ifdef X86_AVAILABLE
-
-INSTANTIATE_TEST_SUITE_P(X86, MemoryMapsTester,
-                         ::testing::Values(Triple::x86_64));
-
-#endif
-
-#ifdef AARCH64_AVAILABLE
-
-INSTANTIATE_TEST_SUITE_P(AArch64, MemoryMapsTester,
-                         ::testing::Values(Triple::aarch64));
-
-#endif
-
-/// Check that the correct mmap size is computed when we have multiple text
-/// segment mappings.
-TEST_P(MemoryMapsTester, ParseMultipleSegments) {
-  const int Pid = 1234;
-  StringRef Filename = "BINARY";
-  opts::ReadPerfEvents = formatv(
-      "name       0 [000]     0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
-      "[0xabc0000000(0x1000000) @ 0x11c0000 103:01 1573523 0]: r-xp {1}\n"
-      "name       0 [000]     0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
-      "[0xabc2000000(0x8000000) @ 0x31d0000 103:01 1573523 0]: r-xp {1}\n",
-      Pid, Filename);
-
-  BC->SegmentMapInfo[0x11da000] =
-      SegmentInfo{0x11da000, 0x10da000, 0x11ca000, 0x10da000, 0x10000, true};
-  BC->SegmentMapInfo[0x31d0000] =
-      SegmentInfo{0x31d0000, 0x51ac82c, 0x31d0000, 0x3000000, 0x200000, true};
-
-  DataAggregator DA("");
-  BC->setFilename(Filename);
-  Error Err = DA.preprocessProfile(*BC);
-
-  // Ignore errors from perf2bolt when parsing memory events later on.
-  ASSERT_THAT_ERROR(std::move(Err), Succeeded());
-
-  auto &BinaryMMapInfo = DA.getBinaryMMapInfo();
-  auto El = BinaryMMapInfo.find(Pid);
-  // Check that memory mapping is present and has the expected size.
-  ASSERT_NE(El, BinaryMMapInfo.end());
-  ASSERT_EQ(El->second.Size, static_cast<uint64_t>(0xb1d0000));
-}
-
-/// Check that DataAggregator aborts when pre-processing an input binary
-/// with multiple text segments that have different base addresses.
-TEST_P(MemoryMapsTester, MultipleSegmentsMismatchedBaseAddress) {
-  const int Pid = 1234;
-  StringRef Filename = "BINARY";
-  opts::ReadPerfEvents = formatv(
-      "name       0 [000]     0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
-      "[0xabc0000000(0x1000000) @ 0x11c0000 103:01 1573523 0]: r-xp {1}\n"
-      "name       0 [000]     0.000000: PERF_RECORD_MMAP2 {0}/{0}: "
-      "[0xabc2000000(0x8000000) @ 0x31d0000 103:01 1573523 0]: r-xp {1}\n",
-      Pid, Filename);
-
-  BC->SegmentMapInfo[0x11da000] =
-      SegmentInfo{0x11da000, 0x10da000, 0x11ca000, 0x10da000, 0x10000, true};
-  // Using '0x31d0fff' FileOffset which triggers a different base address
-  // for this second text segment.
-  BC->SegmentMapInfo[0x31d0000] =
-      SegmentInfo{0x31d0000, 0x51ac82c, 0x31d0fff, 0x3000000, 0x200000, true};
-
-  DataAggregator DA("");
-  BC->setFilename(Filename);
-  ASSERT_DEATH(
-      { Error Err = DA.preprocessProfile(*BC); },
-      "Base address on multiple segment mappings should match");
-}
diff --git a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn
index 945d31afca10f0..51dc24481a513b 100644
--- a/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn
+++ b/llvm/utils/gn/secondary/bolt/unittests/Core/BUILD.gn
@@ -15,7 +15,6 @@ unittest("CoreTests") {
     "BinaryContext.cpp",
     "DynoStats.cpp",
     "MCPlusBuilder.cpp",
-    "MemoryMaps.cpp",
   ]
 
   defines = []

From ead3a2f5980e1a713c8d4e18a4c825e1012b3701 Mon Sep 17 00:00:00 2001
From: Han-Kuan Chen <hankuan.chen@sifive.com>
Date: Tue, 26 Nov 2024 22:05:54 +0800
Subject: [PATCH 13/41] [SLP][REVEC] getScalarizationOverhead should not be
 used when ScalarTy is FixedVectorType. (#117536)

---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 16 ++++++++--
 .../SLPVectorizer/SystemZ/revec-fix-117393.ll | 30 +++++++++++++++++++
 2 files changed, 44 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 115cbd4d2ce5e4..48a8520a966fc7 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -9614,8 +9614,20 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
     Cost += ::getShuffleCost(*TTI, TTI::SK_InsertSubvector, VecTy, {}, CostKind,
                              Idx, getWidenedType(ScalarTy, Sz));
   }
-  Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
-                                        /*Extract=*/false, CostKind);
+  if (auto *FTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+    assert(SLPReVec && "Only supported by REVEC.");
+    // If ScalarTy is FixedVectorType, we should use CreateInsertVector instead
+    // of CreateInsertElement.
+    unsigned ScalarTyNumElements = getNumElements(ScalarTy);
+    for (unsigned I : seq<unsigned>(TE.Scalars.size()))
+      if (DemandedElts[I])
+        Cost +=
+            TTI->getShuffleCost(TTI::SK_InsertSubvector, VecTy, std::nullopt,
+                                CostKind, I * ScalarTyNumElements, FTy);
+  } else {
+    Cost += TTI->getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true,
+                                          /*Extract=*/false, CostKind);
+  }
   int Sz = TE.Scalars.size();
   SmallVector<int> ReorderMask(TE.ReorderIndices.begin(),
                                TE.ReorderIndices.end());
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
new file mode 100644
index 00000000000000..c40e32baad7b31
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-117393.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -mtriple=systemz-unknown -mcpu=z15 -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s
+
+define void @h() {
+; CHECK-LABEL: @h(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP1:%.*]] = or <4 x i32> [[TMP0]], zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = or <4 x i32> splat (i32 1), zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = shl <4 x i32> zeroinitializer, zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = or <4 x i32> [[TMP3]], zeroinitializer
+; CHECK-NEXT:    [[TMP5:%.*]] = and <4 x i32> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[TMP6:%.*]] = and <4 x i32> zeroinitializer, [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = and <4 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> [[TMP7]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %1 = or <4 x i32> %0, zeroinitializer
+  %2 = or <4 x i32> splat (i32 1), zeroinitializer
+  %3 = or <4 x i32> zeroinitializer, zeroinitializer
+  %4 = shl <4 x i32> zeroinitializer, zeroinitializer
+  %5 = or <4 x i32> %4, zeroinitializer
+  %6 = and <4 x i32> %2, %1
+  %7 = and <4 x i32> %3, %6
+  %8 = and <4 x i32> %5, %7
+  %9 = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %8)
+  ret void
+}

From 59b3630e032d7e92079667891e7cf585c7fe313d Mon Sep 17 00:00:00 2001
From: Victor Perez <victor.perez@codeplay.com>
Date: Tue, 26 Nov 2024 15:08:31 +0100
Subject: [PATCH 14/41] [MLIR][SPIR-V] Drop commas from split barrier
 operations ASM format (#116673)

Drop commas from split barrier operations assembly format.

Signed-off-by: Victor Perez <victor.perez@codeplay.com>


Depends on #116648, review ec8d35471602cd88aa2ebaf239b698ef3ba353bd
only.

---------

Signed-off-by: Victor Perez <victor.perez@codeplay.com>
---
 mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtOps.td    | 6 +++---
 mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir | 4 ++--
 mlir/test/Dialect/SPIRV/IR/intel-ext-ops.mlir             | 8 ++++----
 mlir/test/Target/SPIRV/intel-ext-ops.mlir                 | 8 ++++----
 4 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtOps.td
index 8ff7d0d63469fd..82d26e365fb243 100644
--- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtOps.td
+++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVIntelExtOps.td
@@ -131,7 +131,7 @@ class SPIRV_IntelSplitBarrierOp<string mnemonic>
   let results = (outs);
 
   let assemblyFormat = [{
-    $execution_scope `,` $memory_scope `,` $memory_semantics attr-dict
+    $execution_scope $memory_scope $memory_semantics attr-dict
   }];
 
   let hasVerifier = 0;
@@ -160,7 +160,7 @@ def SPIRV_INTELControlBarrierArriveOp
     #### Example:
 
     ```mlir
-    spirv.ControlBarrierArrive <Workgroup>, <Device>, <Acquire|UniformMemory>
+    spirv.ControlBarrierArrive <Workgroup> <Device> <Acquire|UniformMemory>
     ```
   }];
 }
@@ -194,7 +194,7 @@ def SPIRV_INTELControlBarrierWaitOp
     #### Example:
 
     ```mlir
-    spirv.ControlBarrierWait <Workgroup>, <Device>, <Acquire|UniformMemory>
+    spirv.ControlBarrierWait <Workgroup> <Device> <Acquire|UniformMemory>
     ```
   }];
 }
diff --git a/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir b/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir
index a5cae67a3d5c5d..359aa350ac90c6 100644
--- a/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir
+++ b/mlir/test/Conversion/SPIRVToLLVM/barrier-ops-to-llvm.mlir
@@ -37,12 +37,12 @@ spirv.func @split_barrier() "None" {
   // CHECK:         [[MEMORY:%.*]] = llvm.mlir.constant(2 : i32) : i32
   // CHECK:         [[SEMANTICS:%.*]] = llvm.mlir.constant(768 : i32) : i32
   // CHECK:         llvm.call spir_funccc @_Z33__spirv_ControlBarrierArriveINTELiii([[EXECUTION]], [[MEMORY]], [[SEMANTICS]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> ()
-  spirv.INTEL.ControlBarrierArrive <Workgroup>, <Workgroup>, <CrossWorkgroupMemory|WorkgroupMemory>
+  spirv.INTEL.ControlBarrierArrive <Workgroup> <Workgroup> <CrossWorkgroupMemory|WorkgroupMemory>
 
   // CHECK:         [[EXECUTION:%.*]] = llvm.mlir.constant(2 : i32) : i32
   // CHECK:         [[MEMORY:%.*]] = llvm.mlir.constant(2 : i32) : i32
   // CHECK:         [[SEMANTICS:%.*]] = llvm.mlir.constant(256 : i32) : i32
   // CHECK:         llvm.call spir_funccc @_Z31__spirv_ControlBarrierWaitINTELiii([[EXECUTION]], [[MEMORY]], [[SEMANTICS]]) {convergent, no_unwind, will_return} : (i32, i32, i32) -> ()
-  spirv.INTEL.ControlBarrierWait <Workgroup>, <Workgroup>, <WorkgroupMemory>
+  spirv.INTEL.ControlBarrierWait <Workgroup> <Workgroup> <WorkgroupMemory>
   spirv.Return
 }
diff --git a/mlir/test/Dialect/SPIRV/IR/intel-ext-ops.mlir b/mlir/test/Dialect/SPIRV/IR/intel-ext-ops.mlir
index 6dd0353d9374ad..bb15d018a6c448 100644
--- a/mlir/test/Dialect/SPIRV/IR/intel-ext-ops.mlir
+++ b/mlir/test/Dialect/SPIRV/IR/intel-ext-ops.mlir
@@ -77,10 +77,10 @@ spirv.func @bf16_to_f32_vec_unsupported(%arg0 : vector<2xi16>) "None" {
 //===----------------------------------------------------------------------===//
 
 spirv.func @split_barrier() "None" {
-  // CHECK: spirv.INTEL.ControlBarrierArrive <Workgroup>, <Device>, <Acquire|UniformMemory>
-  spirv.INTEL.ControlBarrierArrive <Workgroup>, <Device>, <Acquire|UniformMemory>
-  // CHECK: spirv.INTEL.ControlBarrierWait <Workgroup>, <Device>, <Acquire|UniformMemory>
-  spirv.INTEL.ControlBarrierWait <Workgroup>, <Device>, <Acquire|UniformMemory>
+  // CHECK: spirv.INTEL.ControlBarrierArrive <Workgroup> <Device> <Acquire|UniformMemory>
+  spirv.INTEL.ControlBarrierArrive <Workgroup> <Device> <Acquire|UniformMemory>
+  // CHECK: spirv.INTEL.ControlBarrierWait <Workgroup> <Device> <Acquire|UniformMemory>
+  spirv.INTEL.ControlBarrierWait <Workgroup> <Device> <Acquire|UniformMemory>
   spirv.Return
 }
 
diff --git a/mlir/test/Target/SPIRV/intel-ext-ops.mlir b/mlir/test/Target/SPIRV/intel-ext-ops.mlir
index 8c50501cf7409d..6d2fd324363c62 100644
--- a/mlir/test/Target/SPIRV/intel-ext-ops.mlir
+++ b/mlir/test/Target/SPIRV/intel-ext-ops.mlir
@@ -40,10 +40,10 @@ spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [Bfloat16ConversionINTEL]
 spirv.module Logical GLSL450 requires #spirv.vce<v1.0, [SplitBarrierINTEL], [SPV_INTEL_split_barrier]> {
   // CHECK-LABEL: @split_barrier
   spirv.func @split_barrier() "None" {
-    // CHECK: spirv.INTEL.ControlBarrierArrive <Workgroup>, <Device>, <Acquire|UniformMemory>
-    spirv.INTEL.ControlBarrierArrive <Workgroup>, <Device>, <Acquire|UniformMemory>
-    // CHECK: spirv.INTEL.ControlBarrierWait <Workgroup>, <Device>, <Acquire|UniformMemory>
-    spirv.INTEL.ControlBarrierWait <Workgroup>, <Device>, <Acquire|UniformMemory>
+    // CHECK: spirv.INTEL.ControlBarrierArrive <Workgroup> <Device> <Acquire|UniformMemory>
+    spirv.INTEL.ControlBarrierArrive <Workgroup> <Device> <Acquire|UniformMemory>
+    // CHECK: spirv.INTEL.ControlBarrierWait <Workgroup> <Device> <Acquire|UniformMemory>
+    spirv.INTEL.ControlBarrierWait <Workgroup> <Device> <Acquire|UniformMemory>
     spirv.Return
   }
 }

From 619e4b7154606f315572ba54c0fe6c1f6c8848a0 Mon Sep 17 00:00:00 2001
From: 7FM <chill_dein_leben@gmx.de>
Date: Tue, 26 Nov 2024 15:11:00 +0100
Subject: [PATCH 15/41] [MLIR][Arith] SelectOp fix invalid folding (#117555)

The pattern `select %x, true, false => %x` is only valid in case that
the return type is identical to the type of `%x` (i.e., i1). Hence, the
check `isInteger(1)` was replaced with `isSignlessInteger(1)`.

Fixes: https://github.com/llvm/llvm-project/issues/117554
---
 mlir/lib/Dialect/Arith/IR/ArithOps.cpp    |  3 ++-
 mlir/test/Dialect/Arith/canonicalize.mlir | 12 ++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
index 47766f36ad05cf..16b4e8eb4f022c 100644
--- a/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
+++ b/mlir/lib/Dialect/Arith/IR/ArithOps.cpp
@@ -2322,7 +2322,8 @@ OpFoldResult arith::SelectOp::fold(FoldAdaptor adaptor) {
     return trueVal;
 
   // select %x, true, false => %x
-  if (getType().isInteger(1) && matchPattern(adaptor.getTrueValue(), m_One()) &&
+  if (getType().isSignlessInteger(1) &&
+      matchPattern(adaptor.getTrueValue(), m_One()) &&
       matchPattern(adaptor.getFalseValue(), m_Zero()))
     return condition;
 
diff --git a/mlir/test/Dialect/Arith/canonicalize.mlir b/mlir/test/Dialect/Arith/canonicalize.mlir
index f56bf0980b13c1..1d4d5fc6f8319a 100644
--- a/mlir/test/Dialect/Arith/canonicalize.mlir
+++ b/mlir/test/Dialect/Arith/canonicalize.mlir
@@ -54,6 +54,18 @@ func.func @select_extui_i1(%arg0: i1) -> i1 {
   return %res : i1
 }
 
+// CHECK-LABEL: @select_no_fold_ui1
+//       CHECK:  %[[CONST_0:.+]] = "test.constant"() <{value = 0 : i32}> : () -> ui1
+//       CHECK:  %[[CONST_1:.+]] = "test.constant"() <{value = 1 : i32}> : () -> ui1
+//  CHECK-NEXT:  %[[RES:.+]] = arith.select %arg0, %[[CONST_1]], %[[CONST_0]] : ui1
+//  CHECK-NEXT:   return %[[RES]]
+func.func @select_no_fold_ui1(%arg0: i1) -> ui1 {
+  %c0_i1 = "test.constant"() {value = 0 : i32} : () -> ui1
+  %c1_i1 = "test.constant"() {value = 1 : i32} : () -> ui1
+  %res = arith.select %arg0, %c1_i1, %c0_i1 : ui1
+  return %res : ui1
+}
+
 // CHECK-LABEL: @select_cst_false_scalar
 //  CHECK-SAME:   (%[[ARG0:.+]]: i32, %[[ARG1:.+]]: i32)
 //  CHECK-NEXT:   return %[[ARG1]]

From f4d758634305304c0deb49a4ed3f99180a2488ea Mon Sep 17 00:00:00 2001
From: Jakub Kuderski <jakub@nod-labs.com>
Date: Tue, 26 Nov 2024 09:11:36 -0500
Subject: [PATCH 16/41] [mlir] Use `llvm::filter_to_vector`. NFC. (#117655)

This got recently added to SmallVectorExtras:
https://github.com/llvm/llvm-project/pull/117460.
---
 .../Conversion/TosaToLinalg/TosaToLinalg.cpp   |  4 ++--
 mlir/lib/Dialect/LLVMIR/IR/LLVMInterfaces.cpp  |  7 +++----
 mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp  | 12 ++++--------
 .../Dialect/Linalg/Transforms/Transforms.cpp   |  4 ++--
 mlir/lib/Dialect/Shape/IR/Shape.cpp            |  4 ++--
 mlir/lib/IR/Operation.cpp                      |  8 ++++----
 mlir/lib/IR/TypeUtilities.cpp                  |  4 ++--
 mlir/lib/Interfaces/DataLayoutInterfaces.cpp   |  8 ++++----
 mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp  | 18 +++++++++---------
 9 files changed, 32 insertions(+), 37 deletions(-)

diff --git a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
index 5291f95d371442..cdbcd3013e139a 100644
--- a/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
+++ b/mlir/lib/Conversion/TosaToLinalg/TosaToLinalg.cpp
@@ -701,9 +701,9 @@ computeTargetSize(PatternRewriter &rewriter, Location loc, IndexPool &indexPool,
 
   // Filter operands with dynamic dimension
   auto operandsWithDynamicDim =
-      llvm::to_vector(llvm::make_filter_range(operands, [&](Value operand) {
+      llvm::filter_to_vector(operands, [&](Value operand) {
         return cast<RankedTensorType>(operand.getType()).isDynamicDim(dim);
-      }));
+      });
 
   // If no operand has a dynamic dimension, it means all sizes were 1
   if (operandsWithDynamicDim.empty())
diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMInterfaces.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMInterfaces.cpp
index a59900745d026e..ca1277c09323b8 100644
--- a/mlir/lib/Dialect/LLVMIR/IR/LLVMInterfaces.cpp
+++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMInterfaces.cpp
@@ -99,10 +99,9 @@ SmallVector<Value> mlir::LLVM::MemsetInlineOp::getAccessedOperands() {
 }
 
 SmallVector<Value> mlir::LLVM::CallOp::getAccessedOperands() {
-  return llvm::to_vector(
-      llvm::make_filter_range(getArgOperands(), [](Value arg) {
-        return isa<LLVMPointerType>(arg.getType());
-      }));
+  return llvm::filter_to_vector(getArgOperands(), [](Value arg) {
+    return isa<LLVMPointerType>(arg.getType());
+  });
 }
 
 #include "mlir/Dialect/LLVMIR/LLVMInterfaces.cpp.inc"
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
index 31f37334ce3978..61bab2ed675307 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp
@@ -375,10 +375,8 @@ static void calculateTileOffsetsAndSizes(
   b.setInsertionPointToStart(forallOp.getBody(0));
 
   SmallVector<Value> threadIds = forallOp.getInductionVars();
-  SmallVector<OpFoldResult> nonZeroNumThreads =
-      llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
-        return !isConstantIntValue(ofr, 0);
-      }));
+  SmallVector<OpFoldResult> nonZeroNumThreads = llvm::filter_to_vector(
+      numThreads, [](OpFoldResult ofr) { return !isConstantIntValue(ofr, 0); });
   int64_t nLoops = loopRanges.size();
   tiledOffsets.reserve(nLoops);
   tiledSizes.reserve(nLoops);
@@ -656,10 +654,8 @@ FailureOr<linalg::ForallReductionTilingResult> linalg::tileReductionUsingForall(
 
   Operation *tiledOp = nullptr;
 
-  SmallVector<OpFoldResult> nonZeroNumThreads =
-      llvm::to_vector(llvm::make_filter_range(numThreads, [](OpFoldResult ofr) {
-        return !isConstantIntValue(ofr, 0);
-      }));
+  SmallVector<OpFoldResult> nonZeroNumThreads = llvm::filter_to_vector(
+      numThreads, [](OpFoldResult ofr) { return !isConstantIntValue(ofr, 0); });
   SmallVector<Value> materializedNonZeroNumThreads =
       getValueOrCreateConstantIndexOp(b, loc, nonZeroNumThreads);
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index c3e176299317ef..eeaa70c0b65892 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1090,8 +1090,8 @@ getPackUnpackNormalizedPerm(int rank, ArrayRef<int64_t> perm) {
   SmallVector<int64_t> vec(rank, kNonTiledMarker);
   for (auto [index, value] : llvm::enumerate(perm))
     vec[value] = index;
-  SmallVector<int64_t> normalizedPerm = llvm::to_vector(llvm::make_filter_range(
-      vec, [&](int64_t v) { return v != kNonTiledMarker; }));
+  SmallVector<int64_t> normalizedPerm = llvm::filter_to_vector(
+      vec, [&](int64_t v) { return v != kNonTiledMarker; });
   // This inverts the permutation in addition to normalizing so invert back.
   return invertPermutationVector(normalizedPerm);
 }
diff --git a/mlir/lib/Dialect/Shape/IR/Shape.cpp b/mlir/lib/Dialect/Shape/IR/Shape.cpp
index 8eb8e579954faa..bebfaa8c1ea822 100644
--- a/mlir/lib/Dialect/Shape/IR/Shape.cpp
+++ b/mlir/lib/Dialect/Shape/IR/Shape.cpp
@@ -695,8 +695,8 @@ struct RemoveEmptyShapeOperandsPattern : public OpRewritePattern<OpTy> {
       }
       return true;
     };
-    auto newOperands = llvm::to_vector<8>(
-        llvm::make_filter_range(op->getOperands(), isPotentiallyNonEmptyShape));
+    auto newOperands = llvm::filter_to_vector<8>(op->getOperands(),
+                                                 isPotentiallyNonEmptyShape);
 
     // Reduce op to equivalent without empty shape operands.
     if (newOperands.size() < op.getNumOperands()) {
diff --git a/mlir/lib/IR/Operation.cpp b/mlir/lib/IR/Operation.cpp
index 3272ece65ba531..fe0fee0f8db2ce 100644
--- a/mlir/lib/IR/Operation.cpp
+++ b/mlir/lib/IR/Operation.cpp
@@ -1309,10 +1309,10 @@ LogicalResult OpTrait::impl::verifyNoRegionArguments(Operation *op) {
 
 LogicalResult OpTrait::impl::verifyElementwise(Operation *op) {
   auto isMappableType = llvm::IsaPred<VectorType, TensorType>;
-  auto resultMappableTypes = llvm::to_vector<1>(
-      llvm::make_filter_range(op->getResultTypes(), isMappableType));
-  auto operandMappableTypes = llvm::to_vector<2>(
-      llvm::make_filter_range(op->getOperandTypes(), isMappableType));
+  auto resultMappableTypes =
+      llvm::filter_to_vector<1>(op->getResultTypes(), isMappableType);
+  auto operandMappableTypes =
+      llvm::filter_to_vector<2>(op->getOperandTypes(), isMappableType);
 
   // If the op only has scalar operand/result types, then we have nothing to
   // check.
diff --git a/mlir/lib/IR/TypeUtilities.cpp b/mlir/lib/IR/TypeUtilities.cpp
index e569d440ca95c4..ec646cad841ae5 100644
--- a/mlir/lib/IR/TypeUtilities.cpp
+++ b/mlir/lib/IR/TypeUtilities.cpp
@@ -141,8 +141,8 @@ LogicalResult mlir::verifyCompatibleShapes(TypeRange types) {
   }
 
   // Remove all unranked shapes
-  auto shapes = llvm::to_vector<8>(llvm::make_filter_range(
-      shapedTypes, [](auto shapedType) { return shapedType.hasRank(); }));
+  auto shapes = llvm::filter_to_vector<8>(
+      shapedTypes, [](auto shapedType) { return shapedType.hasRank(); });
   if (shapes.empty())
     return success();
 
diff --git a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
index 9469780129d644..1c661e3beea48e 100644
--- a/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
+++ b/mlir/lib/Interfaces/DataLayoutInterfaces.cpp
@@ -304,11 +304,11 @@ mlir::detail::getDevicePropertyValue(DataLayoutEntryInterface entry) {
 DataLayoutEntryList
 mlir::detail::filterEntriesForType(DataLayoutEntryListRef entries,
                                    TypeID typeID) {
-  return llvm::to_vector<4>(llvm::make_filter_range(
+  return llvm::filter_to_vector<4>(
       entries, [typeID](DataLayoutEntryInterface entry) {
         auto type = llvm::dyn_cast_if_present<Type>(entry.getKey());
         return type && type.getTypeID() == typeID;
-      }));
+      });
 }
 
 DataLayoutEntryInterface
@@ -393,9 +393,9 @@ static DataLayoutSpecInterface getCombinedDataLayout(Operation *leaf) {
 
   // Create the list of non-null specs (null/missing specs can be safely
   // ignored) from the outermost to the innermost.
-  auto nonNullSpecs = llvm::to_vector<2>(llvm::make_filter_range(
+  auto nonNullSpecs = llvm::filter_to_vector<2>(
       llvm::reverse(specs),
-      [](DataLayoutSpecInterface iface) { return iface != nullptr; }));
+      [](DataLayoutSpecInterface iface) { return iface != nullptr; });
 
   // Combine the specs using the innermost as anchor.
   if (DataLayoutSpecInterface current = getSpec(leaf))
diff --git a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
index d7967c7a77534d..da28ca3a7eba97 100644
--- a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
+++ b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp
@@ -10,6 +10,7 @@
 #include "mlir/TableGen/GenInfo.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVectorExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/TableGen/Error.h"
@@ -161,13 +162,12 @@ void printParseConditional(mlir::raw_indented_ostream &ios,
     return formatv("read{0}", capitalize(name));
   };
 
-  auto parsedArgs =
-      llvm::to_vector(make_filter_range(args, [](const Init *const attr) {
-        const Record *def = cast<DefInit>(attr)->getDef();
-        if (def->isSubClassOf("Array"))
-          return true;
-        return !def->getValueAsString("cParser").empty();
-      }));
+  auto parsedArgs = llvm::filter_to_vector(args, [](const Init *const attr) {
+    const Record *def = cast<DefInit>(attr)->getDef();
+    if (def->isSubClassOf("Array"))
+      return true;
+    return !def->getValueAsString("cParser").empty();
+  });
 
   interleave(
       zip(parsedArgs, argNames),
@@ -277,8 +277,8 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType,
   printParseConditional(ios, args, argNames);
 
   // Compute args to pass to create method.
-  auto passedArgs = llvm::to_vector(make_filter_range(
-      argNames, [](StringRef str) { return !str.starts_with("_"); }));
+  auto passedArgs = llvm::filter_to_vector(
+      argNames, [](StringRef str) { return !str.starts_with("_"); });
   std::string argStr;
   raw_string_ostream argStream(argStr);
   interleaveComma(passedArgs, argStream,

From 4028bb10c3a396023b877d025c5776d207f29f91 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 Nov 2024 06:13:34 -0800
Subject: [PATCH 17/41] Local: Handle noalias_addrspace in combineMetadata
 (#103938)

This should act like range.

Previously ConstantRangeList assumed a 64-bit range. Now query from the
actual entries. This also means that the empty range has no bitwidth, so
move asserts to avoid checking the bitwidth of empty ranges.
---
 llvm/include/llvm/IR/ConstantRangeList.h      |  7 +-
 llvm/include/llvm/IR/Metadata.h               |  1 +
 llvm/lib/IR/ConstantRangeList.cpp             | 15 ++--
 llvm/lib/IR/Metadata.cpp                      | 38 ++++++++++
 llvm/lib/Transforms/Utils/Local.cpp           |  8 +-
 .../Transforms/EarlyCSE/noalias-addrspace.ll  | 73 +++++++++++++++++++
 .../SimplifyCFG/hoist-with-metadata.ll        | 51 ++++++++++++-
 7 files changed, 181 insertions(+), 12 deletions(-)
 create mode 100644 llvm/test/Transforms/EarlyCSE/noalias-addrspace.ll

diff --git a/llvm/include/llvm/IR/ConstantRangeList.h b/llvm/include/llvm/IR/ConstantRangeList.h
index 44d1daebe49e4a..b12c913103df57 100644
--- a/llvm/include/llvm/IR/ConstantRangeList.h
+++ b/llvm/include/llvm/IR/ConstantRangeList.h
@@ -35,7 +35,7 @@ class [[nodiscard]] ConstantRangeList {
   ConstantRangeList(ArrayRef<ConstantRange> RangesRef) {
     assert(isOrderedRanges(RangesRef));
     for (const ConstantRange &R : RangesRef) {
-      assert(R.getBitWidth() == getBitWidth());
+      assert(empty() || R.getBitWidth() == getBitWidth());
       Ranges.push_back(R);
     }
   }
@@ -59,8 +59,9 @@ class [[nodiscard]] ConstantRangeList {
   /// Return true if this list contains no members.
   bool empty() const { return Ranges.empty(); }
 
-  /// Get the bit width of this ConstantRangeList.
-  uint32_t getBitWidth() const { return 64; }
+  /// Get the bit width of this ConstantRangeList. It is invalid to call this
+  /// with an empty range.
+  uint32_t getBitWidth() const { return Ranges.front().getBitWidth(); }
 
   /// Return the number of ranges in this ConstantRangeList.
   size_t size() const { return Ranges.size(); }
diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h
index 70882767545164..35580f3f38c615 100644
--- a/llvm/include/llvm/IR/Metadata.h
+++ b/llvm/include/llvm/IR/Metadata.h
@@ -1456,6 +1456,7 @@ class MDNode : public Metadata {
   static MDNode *getMostGenericTBAA(MDNode *A, MDNode *B);
   static MDNode *getMostGenericFPMath(MDNode *A, MDNode *B);
   static MDNode *getMostGenericRange(MDNode *A, MDNode *B);
+  static MDNode *getMostGenericNoaliasAddrspace(MDNode *A, MDNode *B);
   static MDNode *getMostGenericAliasScope(MDNode *A, MDNode *B);
   static MDNode *getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B);
   /// Merge !prof metadata from two instructions.
diff --git a/llvm/lib/IR/ConstantRangeList.cpp b/llvm/lib/IR/ConstantRangeList.cpp
index 0856f79bb9191a..3ee8d6f22b7487 100644
--- a/llvm/lib/IR/ConstantRangeList.cpp
+++ b/llvm/lib/IR/ConstantRangeList.cpp
@@ -39,12 +39,14 @@ void ConstantRangeList::insert(const ConstantRange &NewRange) {
     return;
   assert(!NewRange.isFullSet() && "Do not support full set");
   assert(NewRange.getLower().slt(NewRange.getUpper()));
-  assert(getBitWidth() == NewRange.getBitWidth());
   // Handle common cases.
   if (empty() || Ranges.back().getUpper().slt(NewRange.getLower())) {
     Ranges.push_back(NewRange);
     return;
   }
+
+  assert(getBitWidth() == NewRange.getBitWidth());
+
   if (NewRange.getUpper().slt(Ranges.front().getLower())) {
     Ranges.insert(Ranges.begin(), NewRange);
     return;
@@ -142,14 +144,15 @@ void ConstantRangeList::subtract(const ConstantRange &SubRange) {
 
 ConstantRangeList
 ConstantRangeList::unionWith(const ConstantRangeList &CRL) const {
-  assert(getBitWidth() == CRL.getBitWidth() &&
-         "ConstantRangeList bitwidths don't agree!");
   // Handle common cases.
   if (empty())
     return CRL;
   if (CRL.empty())
     return *this;
 
+  assert(getBitWidth() == CRL.getBitWidth() &&
+         "ConstantRangeList bitwidths don't agree!");
+
   ConstantRangeList Result;
   size_t i = 0, j = 0;
   // "PreviousRange" tracks the lowest unioned range that is being processed.
@@ -192,15 +195,15 @@ ConstantRangeList::unionWith(const ConstantRangeList &CRL) const {
 
 ConstantRangeList
 ConstantRangeList::intersectWith(const ConstantRangeList &CRL) const {
-  assert(getBitWidth() == CRL.getBitWidth() &&
-         "ConstantRangeList bitwidths don't agree!");
-
   // Handle common cases.
   if (empty())
     return *this;
   if (CRL.empty())
     return CRL;
 
+  assert(getBitWidth() == CRL.getBitWidth() &&
+         "ConstantRangeList bitwidths don't agree!");
+
   ConstantRangeList Result;
   size_t i = 0, j = 0;
   while (i < size() && j < CRL.size()) {
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index 98cfbd11fde58f..e3a46840eea0e7 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/ConstantRangeList.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/DebugLoc.h"
@@ -1353,6 +1354,43 @@ MDNode *MDNode::getMostGenericRange(MDNode *A, MDNode *B) {
   return MDNode::get(A->getContext(), MDs);
 }
 
+MDNode *MDNode::getMostGenericNoaliasAddrspace(MDNode *A, MDNode *B) {
+  if (!A || !B)
+    return nullptr;
+
+  if (A == B)
+    return A;
+
+  SmallVector<ConstantRange> RangeListA, RangeListB;
+  for (unsigned I = 0, E = A->getNumOperands() / 2; I != E; ++I) {
+    auto *LowA = mdconst::extract<ConstantInt>(A->getOperand(2 * I + 0));
+    auto *HighA = mdconst::extract<ConstantInt>(A->getOperand(2 * I + 1));
+    RangeListA.push_back(ConstantRange(LowA->getValue(), HighA->getValue()));
+  }
+
+  for (unsigned I = 0, E = B->getNumOperands() / 2; I != E; ++I) {
+    auto *LowB = mdconst::extract<ConstantInt>(B->getOperand(2 * I + 0));
+    auto *HighB = mdconst::extract<ConstantInt>(B->getOperand(2 * I + 1));
+    RangeListB.push_back(ConstantRange(LowB->getValue(), HighB->getValue()));
+  }
+
+  ConstantRangeList CRLA(RangeListA);
+  ConstantRangeList CRLB(RangeListB);
+  ConstantRangeList Result = CRLA.intersectWith(CRLB);
+  if (Result.empty())
+    return nullptr;
+
+  SmallVector<Metadata *> MDs;
+  for (const ConstantRange &CR : Result) {
+    MDs.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(A->getContext(), CR.getLower())));
+    MDs.push_back(ConstantAsMetadata::get(
+        ConstantInt::get(A->getContext(), CR.getUpper())));
+  }
+
+  return MDNode::get(A->getContext(), MDs);
+}
+
 MDNode *MDNode::getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B) {
   if (!A || !B)
     return nullptr;
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index b74d2e8cff0aae..d574095d75c58c 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -3391,6 +3391,11 @@ void llvm::combineMetadata(Instruction *K, const Instruction *J,
         if (DoesKMove)
           K->setMetadata(Kind, MDNode::getMergedProfMetadata(KMD, JMD, K, J));
         break;
+      case LLVMContext::MD_noalias_addrspace:
+        if (DoesKMove)
+          K->setMetadata(Kind,
+                         MDNode::getMostGenericNoaliasAddrspace(JMD, KMD));
+        break;
     }
   }
   // Set !invariant.group from J if J has it. If both instructions have it
@@ -3432,7 +3437,8 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J,
                          LLVMContext::MD_prof,
                          LLVMContext::MD_nontemporal,
                          LLVMContext::MD_noundef,
-                         LLVMContext::MD_mmra};
+                         LLVMContext::MD_mmra,
+                         LLVMContext::MD_noalias_addrspace};
   combineMetadata(K, J, KnownIDs, KDominatesJ);
 }
 
diff --git a/llvm/test/Transforms/EarlyCSE/noalias-addrspace.ll b/llvm/test/Transforms/EarlyCSE/noalias-addrspace.ll
new file mode 100644
index 00000000000000..0a001b55f684cf
--- /dev/null
+++ b/llvm/test/Transforms/EarlyCSE/noalias-addrspace.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -passes='early-cse<memssa>' -S < %s | FileCheck %s
+
+declare void @use(i1)
+declare void @use.ptr(ptr) memory(read)
+
+define void @load_first_noalias_addrspace(ptr %p) {
+; CHECK-LABEL: define void @load_first_noalias_addrspace(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[P]], align 8, !nonnull [[META0:![0-9]+]], !noundef [[META0]], !noalias.addrspace [[META1:![0-9]+]]
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    ret void
+;
+  %v1 = load ptr, ptr %p, !nonnull !{}, !noundef !{}, !noalias.addrspace !0
+  call void @use.ptr(ptr %v1)
+  %v2 = load ptr, ptr %p
+  call void @use.ptr(ptr %v2)
+  ret void
+}
+
+define void @load_both_same_noalias_addrspace(ptr %p) {
+; CHECK-LABEL: define void @load_both_same_noalias_addrspace(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[P]], align 8, !nonnull [[META0]], !noundef [[META0]], !noalias.addrspace [[META1]]
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    ret void
+;
+  %v1 = load ptr, ptr %p, !nonnull !{}, !noundef !{}, !noalias.addrspace !0
+  call void @use.ptr(ptr %v1)
+  %v2 = load ptr, ptr %p, !noalias.addrspace !0
+  call void @use.ptr(ptr %v2)
+  ret void
+}
+
+define void @load_both_disjoint_noalias_addrspace(ptr %p) {
+; CHECK-LABEL: define void @load_both_disjoint_noalias_addrspace(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[P]], align 8, !nonnull [[META0]], !noundef [[META0]], !noalias.addrspace [[META1]]
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    ret void
+;
+  %v1 = load ptr, ptr %p, !nonnull !{}, !noundef !{}, !noalias.addrspace !0
+  call void @use.ptr(ptr %v1)
+  %v2 = load ptr, ptr %p, !noalias.addrspace !1
+  call void @use.ptr(ptr %v2)
+  ret void
+}
+
+define void @load_both_overlap_noalias_addrspace(ptr %p) {
+; CHECK-LABEL: define void @load_both_overlap_noalias_addrspace(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V1:%.*]] = load ptr, ptr [[P]], align 8, !nonnull [[META0]], !noundef [[META0]], !noalias.addrspace [[META1]]
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    call void @use.ptr(ptr [[V1]])
+; CHECK-NEXT:    ret void
+;
+  %v1 = load ptr, ptr %p, !nonnull !{}, !noundef !{}, !noalias.addrspace !0
+  call void @use.ptr(ptr %v1)
+  %v2 = load ptr, ptr %p, !noalias.addrspace !2
+  call void @use.ptr(ptr %v2)
+  ret void
+}
+
+!0 = !{i32 5, i32 6}
+!1 = !{i32 7, i32 8}
+!2 = !{i32 5, i32 7}
+;.
+; CHECK: [[META0]] = !{}
+; CHECK: [[META1]] = !{i32 5, i32 6}
+;.
diff --git a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
index 18aa5c9e044a98..d34ac2bb300407 100644
--- a/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
+++ b/llvm/test/Transforms/SimplifyCFG/hoist-with-metadata.ll
@@ -319,7 +319,7 @@ out:
 define void @hoist_noalias_addrspace_both(i1 %c, ptr %p, i64 %val) {
 ; CHECK-LABEL: @hoist_noalias_addrspace_both(
 ; CHECK-NEXT:  if:
-; CHECK-NEXT:    [[T:%.*]] = atomicrmw add ptr [[P:%.*]], i64 [[VAL:%.*]] seq_cst, align 8
+; CHECK-NEXT:    [[T:%.*]] = atomicrmw add ptr [[P:%.*]], i64 [[VAL:%.*]] seq_cst, align 8, !noalias.addrspace [[META7:![0-9]+]]
 ; CHECK-NEXT:    ret void
 ;
 if:
@@ -361,7 +361,7 @@ out:
 define void @hoist_noalias_addrspace_switch(i64 %i, ptr %p, i64 %val) {
 ; CHECK-LABEL: @hoist_noalias_addrspace_switch(
 ; CHECK-NEXT:  out:
-; CHECK-NEXT:    [[T:%.*]] = atomicrmw add ptr [[P:%.*]], i64 [[VAL:%.*]] seq_cst, align 8
+; CHECK-NEXT:    [[T:%.*]] = atomicrmw add ptr [[P:%.*]], i64 [[VAL:%.*]] seq_cst, align 8, !noalias.addrspace [[META7]]
 ; CHECK-NEXT:    ret void
 ;
   switch i64 %i, label %bb0 [
@@ -381,6 +381,48 @@ out:
   ret void
 }
 
+define void @hoist_noalias_addrspace_switch_multiple(i64 %i, ptr %p, i64 %val) {
+; CHECK-LABEL: @hoist_noalias_addrspace_switch_multiple(
+; CHECK-NEXT:  out:
+; CHECK-NEXT:    [[T:%.*]] = atomicrmw add ptr [[P:%.*]], i64 [[VAL:%.*]] seq_cst, align 8, !noalias.addrspace [[META8:![0-9]+]]
+; CHECK-NEXT:    ret void
+;
+  switch i64 %i, label %bb0 [
+  i64 1, label %bb1
+  i64 2, label %bb2
+  ]
+bb0:
+  %t = atomicrmw add ptr %p, i64 %val seq_cst, !noalias.addrspace !7
+  br label %out
+bb1:
+  %e = atomicrmw add ptr %p, i64 %val seq_cst, !noalias.addrspace !8
+  br label %out
+bb2:
+  %f = atomicrmw add ptr %p, i64 %val seq_cst, !noalias.addrspace !9
+  br label %out
+out:
+  ret void
+}
+
+; !noalias_addrspace is not safe to speculate as it causes immediate undefined behavior.
+define ptr @speculate_noalias_addrspace(i1 %c, ptr dereferenceable(8) align 8 %p) {
+; CHECK-LABEL: @speculate_noalias_addrspace(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[V:%.*]] = load ptr, ptr [[P:%.*]], align 8, !nonnull [[META2]]
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[C:%.*]], ptr [[V]], ptr null
+; CHECK-NEXT:    ret ptr [[SPEC_SELECT]]
+;
+entry:
+  br i1 %c, label %if, label %join
+
+if:
+  %v = load ptr, ptr %p, !nonnull !{}, !noundef !{}, !noalias.addrspace !4
+  br label %join
+
+join:
+  %phi = phi ptr [ %v, %if ], [ null, %entry ]
+  ret ptr %phi
+}
 
 !0 = !{ i8 0, i8 1 }
 !1 = !{ i8 3, i8 5 }
@@ -389,6 +431,9 @@ out:
 !4 = !{i32 5, i32 6}
 !5 = !{i32 5, i32 7}
 !6 = !{i32 4, i32 8}
+!7 = !{i32 4, i32 8, i32 20, i32 31}
+!8 = !{i32 2, i32 5}
+!9 = !{i32 2, i32 5, i32 22, i32 42, i32 45, i32 50}
 
 ;.
 ; CHECK: [[RNG0]] = !{i8 0, i8 1, i8 3, i8 5}
@@ -398,4 +443,6 @@ out:
 ; CHECK: [[RNG4]] = !{i32 0, i32 10}
 ; CHECK: [[META5]] = !{i64 4}
 ; CHECK: [[META6]] = !{float 2.500000e+00}
+; CHECK: [[META7]] = !{i32 5, i32 6}
+; CHECK: [[META8]] = !{i32 4, i32 5}
 ;.

From ab6677e7d64b4612d6c92877cb1d529f922268d2 Mon Sep 17 00:00:00 2001
From: Florian Hahn <flo@fhahn.com>
Date: Tue, 26 Nov 2024 14:16:16 +0000
Subject: [PATCH 18/41] [LICM] Only set AA metadata on hoisted load if it
 executes. (#117204)

https://github.com/llvm/llvm-project/pull/116220 clarified that
violations of aliasing metadata are UB.

Only set the AA metadata after hoisting a log, if it is guaranteed to
execute in the original loop.

PR: https://github.com/llvm/llvm-project/pull/117204
---
 llvm/lib/Transforms/Scalar/LICM.cpp                 | 13 ++++++++++---
 llvm/test/Transforms/LICM/hoist-metadata.ll         | 11 ++---------
 .../Transforms/LICM/hoisting-preheader-debugloc.ll  |  4 +++-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 94bfe44a847a37..3ade3202728931 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -2030,7 +2030,9 @@ bool llvm::promoteLoopAccessesToScalars(
 
   bool DereferenceableInPH = false;
   bool StoreIsGuanteedToExecute = false;
+  bool LoadIsGuaranteedToExecute = false;
   bool FoundLoadToPromote = false;
+
   // Goes from Unknown to either Safe or Unsafe, but can't switch between them.
   enum {
     StoreSafe,
@@ -2089,6 +2091,10 @@ bool llvm::promoteLoopAccessesToScalars(
 
         Align InstAlignment = Load->getAlign();
 
+        if (!LoadIsGuaranteedToExecute)
+          LoadIsGuaranteedToExecute =
+              SafetyInfo->isGuaranteedToExecute(*UI, DT, CurLoop);
+
         // Note that proving a load safe to speculate requires proving
         // sufficient alignment at the target location.  Proving it guaranteed
         // to execute does as well.  Thus we can increase our guaranteed
@@ -2233,8 +2239,9 @@ bool llvm::promoteLoopAccessesToScalars(
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, ExitBlocks, InsertPts,
                         MSSAInsertPts, PIC, MSSAU, *LI, DL, Alignment,
-                        SawUnorderedAtomic, AATags, *SafetyInfo,
-                        StoreSafety == StoreSafe);
+                        SawUnorderedAtomic,
+                        StoreIsGuanteedToExecute ? AATags : AAMDNodes(),
+                        *SafetyInfo, StoreSafety == StoreSafe);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
@@ -2247,7 +2254,7 @@ bool llvm::promoteLoopAccessesToScalars(
       PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
     PreheaderLoad->setAlignment(Alignment);
     PreheaderLoad->setDebugLoc(DebugLoc());
-    if (AATags)
+    if (AATags && LoadIsGuaranteedToExecute)
       PreheaderLoad->setAAMetadata(AATags);
 
     MemoryAccess *PreheaderLoadMemoryAccess = MSSAU.createMemoryAccessInBB(
diff --git a/llvm/test/Transforms/LICM/hoist-metadata.ll b/llvm/test/Transforms/LICM/hoist-metadata.ll
index 60b61944b33ae0..f25cb966a37fc7 100644
--- a/llvm/test/Transforms/LICM/hoist-metadata.ll
+++ b/llvm/test/Transforms/LICM/hoist-metadata.ll
@@ -77,7 +77,7 @@ define void @noalias_metadata_load_may_not_execute() {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[A:%.*]] = alloca i32, align 16
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[A]]
-; CHECK-NEXT:    [[GEP_PROMOTED:%.*]] = load i32, ptr [[GEP]], align 4, !tbaa [[TBAA3:![0-9]+]], !noalias [[META7:![0-9]+]]
+; CHECK-NEXT:    [[GEP_PROMOTED:%.*]] = load i32, ptr [[GEP]], align 4
 ; CHECK-NEXT:    br label [[LOOP_HEADER:%.*]]
 ; CHECK:       loop.header:
 ; CHECK-NEXT:    [[ADD1:%.*]] = phi i32 [ [[GEP_PROMOTED]], [[ENTRY:%.*]] ], [ [[ADD:%.*]], [[LOOP_LATCH:%.*]] ]
@@ -92,7 +92,7 @@ define void @noalias_metadata_load_may_not_execute() {
 ; CHECK-NEXT:    br i1 [[CMP]], label [[LOOP_HEADER]], label [[EXIT]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    [[ADD2:%.*]] = phi i32 [ [[ADD]], [[LOOP_LATCH]] ], [ [[ADD1]], [[LOOP_HEADER]] ]
-; CHECK-NEXT:    store i32 [[ADD2]], ptr [[GEP]], align 4, !tbaa [[TBAA3]], !noalias [[META7]]
+; CHECK-NEXT:    store i32 [[ADD2]], ptr [[GEP]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -132,11 +132,4 @@ exit:
 ; CHECK: [[RNG0]] = !{i32 0, i32 10}
 ; CHECK: [[META1]] = !{}
 ; CHECK: [[META2]] = !{i64 4}
-; CHECK: [[TBAA3]] = !{[[META4:![0-9]+]], [[META4]], i64 0}
-; CHECK: [[META4]] = !{!"short", [[META5:![0-9]+]], i64 0}
-; CHECK: [[META5]] = !{!"omnipotent char", [[META6:![0-9]+]], i64 0}
-; CHECK: [[META6]] = !{!"Simple C/C++ TBAA"}
-; CHECK: [[META7]] = !{[[META8:![0-9]+]]}
-; CHECK: [[META8]] = distinct !{[[META8]], [[META9:![0-9]+]]}
-; CHECK: [[META9]] = distinct !{[[META9]]}
 ;.
diff --git a/llvm/test/Transforms/LICM/hoisting-preheader-debugloc.ll b/llvm/test/Transforms/LICM/hoisting-preheader-debugloc.ll
index 570f4230c1a90d..61f0eb19a9bd1b 100644
--- a/llvm/test/Transforms/LICM/hoisting-preheader-debugloc.ll
+++ b/llvm/test/Transforms/LICM/hoisting-preheader-debugloc.ll
@@ -1,6 +1,8 @@
 ; RUN: opt -passes=licm %s -S | FileCheck %s
 
-; CHECK: %arrayidx4.promoted = load i32, ptr %arrayidx4, align 4, !tbaa !{{[0-9]+$}}
+; CHECK: %arrayidx4.promoted = load i32, ptr %arrayidx4, align 4
+; CHECK-NOT: !dbg
+; CHECK: br label %for.body
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 

From 9efdebc5f15e284dc7c58d327057ec8af9eed342 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A9sz=C3=A1ros=20Gergely?= <gergely.meszaros@intel.com>
Date: Tue, 26 Nov 2024 15:16:50 +0100
Subject: [PATCH 19/41] [Clang] Only ignore special methods for unused private
 fields in BuildFieldReferenceExpr (#116965)

The original code assumed that only special methods might be defined as
defaulted. Since C++20 comparison operators might be defaulted too, and
we *do* want to consider those as using the fields of the class.

Fixes: #116961
---
 clang/docs/ReleaseNotes.rst                   |  2 ++
 clang/lib/Sema/SemaExprMember.cpp             | 12 +++++++++--
 .../SemaCXX/warn-unused-private-field.cpp     | 20 +++++++++++++++++++
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 4f4b2ce03cd5c3..c4086a5bcbf368 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -591,6 +591,8 @@ Improvements to Clang's diagnostics
 - For an rvalue reference bound to a temporary struct with an integer member, Clang will detect constant integer overflow
   in the initializer for the integer member (#GH46755).
 
+- Fixed a false negative ``-Wunused-private-field`` diagnostic when a defaulted comparison operator is defined out of class (#GH116961).
+
 Improvements to Clang's time-trace
 ----------------------------------
 
diff --git a/clang/lib/Sema/SemaExprMember.cpp b/clang/lib/Sema/SemaExprMember.cpp
index 434768b99d631e..85d5dfcb3db6de 100644
--- a/clang/lib/Sema/SemaExprMember.cpp
+++ b/clang/lib/Sema/SemaExprMember.cpp
@@ -1874,8 +1874,16 @@ Sema::BuildFieldReferenceExpr(Expr *BaseExpr, bool IsArrow,
           Context.getAttributedType(attr::NoDeref, MemberType, MemberType);
   }
 
-  auto *CurMethod = dyn_cast<CXXMethodDecl>(CurContext);
-  if (!(CurMethod && CurMethod->isDefaulted()))
+  auto isDefaultedSpecialMember = [this](const DeclContext *Ctx) {
+    auto *Method = dyn_cast<CXXMethodDecl>(CurContext);
+    if (!Method || !Method->isDefaulted())
+      return false;
+
+    return getDefaultedFunctionKind(Method).isSpecialMember();
+  };
+
+  // Implicit special members should not mark fields as used.
+  if (!isDefaultedSpecialMember(CurContext))
     UnusedPrivateFields.remove(Field);
 
   ExprResult Base = PerformObjectMemberConversion(BaseExpr, SS.getScopeRep(),
diff --git a/clang/test/SemaCXX/warn-unused-private-field.cpp b/clang/test/SemaCXX/warn-unused-private-field.cpp
index 1128eacc309d9f..bf104b1a76a656 100644
--- a/clang/test/SemaCXX/warn-unused-private-field.cpp
+++ b/clang/test/SemaCXX/warn-unused-private-field.cpp
@@ -20,6 +20,26 @@ class SpaceShipDefaultCompare {
   int operator<=>(const SpaceShipDefaultCompare &) const = default;
 };
 
+class EqDefaultCompareOutOfClass {
+  int used; // no warning, the compiler generated AST for the comparison operator
+            // references the fields of the class, and this should be considered
+            // a use.
+            // This test case is needed because clang does not emit the body
+            // of the defaulted operator when it is defined in-class until it
+            // finds a call to it. `-Wunused-private-field` is suppressed in
+            // a different way in that case.
+  bool operator==(const EqDefaultCompareOutOfClass &) const;
+};
+
+bool EqDefaultCompareOutOfClass::operator==(const EqDefaultCompareOutOfClass &) const = default;
+
+class FriendEqDefaultCompareOutOfClass {
+  int used; // no warning, same reasoning just tested via a friend declaration.
+  friend bool operator==(const FriendEqDefaultCompareOutOfClass &, const FriendEqDefaultCompareOutOfClass &);
+};
+
+bool operator==(const FriendEqDefaultCompareOutOfClass &, const FriendEqDefaultCompareOutOfClass &) = default;
+
 #endif
 
 class NotFullyDefined {

From 624e52b1e310c349e21cc0b4f67452b0fa9d1f96 Mon Sep 17 00:00:00 2001
From: Jeremy Morse <jeremy.morse@sony.com>
Date: Tue, 26 Nov 2024 14:24:25 +0000
Subject: [PATCH 20/41] [DebugInfo] Handle trailing empty blocks when seeking
 prologue_end spot (#117320)

The optimiser will produce empty blocks that are unconditionally
executed according to the CFG -- while it may not be meaningful code,
and won't get a prologue_end position, we need to not crash on this
input.

The fault comes from assuming that there's always a next block with some
instructions in it, that will eventually produce some meaningful control
flow to stop at -- in the given reproducer in issue #117206 this isn't
true, because the function terminates with `unreachable`. Thus, I've
refactored the "get next instruction logic" into a helper that'll step
through all blocks and terminate if there aren't any more.

Reproducer from aeubanks
---
 llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp    | 62 ++++++++++++-------
 .../MIR/X86/dbg-prologue-backup-loc2.mir      | 49 +++++++++++++++
 2 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index affc72b5950fd9..e1291e2a14a66e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -2192,14 +2192,45 @@ findPrologueEndLoc(const MachineFunction *MF) {
   // better off synthesising an early prologue_end.
   auto CurBlock = MF->begin();
   auto CurInst = CurBlock->begin();
-  while (true) {
-    // Skip empty blocks, in rare cases the entry can be empty.
-    if (CurInst == CurBlock->end()) {
-      ++CurBlock;
-      CurInst = CurBlock->begin();
-      continue;
+
+  // Find the initial instruction, we're guaranteed one by the caller, but not
+  // which block it's in.
+  while (CurBlock->empty())
+    CurInst = (++CurBlock)->begin();
+  assert(CurInst != CurBlock->end());
+
+  // Helper function for stepping through the initial sequence of
+  // unconditionally executed instructions.
+  auto getNextInst = [&CurBlock, &CurInst, MF]() -> bool {
+    // We've reached the end of the block. Did we just look at a terminator?
+    if (CurInst->isTerminator()) {
+      // Some kind of "real" control flow is occurring. At the very least
+      // we would have to start exploring the CFG, a good signal that the
+      // prologue is over.
+      return false;
     }
 
+    // If we've already fallen through into a loop, don't fall through
+    // further, use a backup-location.
+    if (CurBlock->pred_size() > 1)
+      return false;
+
+    // Fall-through from entry to the next block. This is common at -O0 when
+    // there's no initialisation in the function. Bail if we're also at the
+    // end of the function, or the remaining blocks have no instructions.
+    // Skip empty blocks, in rare cases the entry can be empty, and
+    // other optimisations may add empty blocks that the control flow falls
+    // through.
+    do {
+      ++CurBlock;
+      if (CurBlock == MF->end())
+        return false;
+    } while (CurBlock->empty());
+    CurInst = CurBlock->begin();
+    return true;
+  };
+
+  while (true) {
     // Check whether this non-meta instruction a good position for prologue_end.
     if (!CurInst->isMetaInstruction()) {
       auto FoundInst = ExamineInst(*CurInst);
@@ -2216,25 +2247,8 @@ findPrologueEndLoc(const MachineFunction *MF) {
       continue;
     }
 
-    // We've reached the end of the block. Did we just look at a terminator?
-    if (CurInst->isTerminator()) {
-      // Some kind of "real" control flow is occurring. At the very least
-      // we would have to start exploring the CFG, a good signal that the
-      // prologue is over.
-      break;
-    }
-
-    // If we've already fallen through into a loop, don't fall through
-    // further, use a backup-location.
-    if (CurBlock->pred_size() > 1)
-      break;
-
-    // Fall-through from entry to the next block. This is common at -O0 when
-    // there's no initialisation in the function. Bail if we're also at the
-    // end of the function.
-    if (++CurBlock == MF->end())
+    if (!getNextInst())
       break;
-    CurInst = CurBlock->begin();
   }
 
   // We couldn't find any source-location, suggesting all meaningful information
diff --git a/llvm/test/DebugInfo/MIR/X86/dbg-prologue-backup-loc2.mir b/llvm/test/DebugInfo/MIR/X86/dbg-prologue-backup-loc2.mir
index c27655ac801316..29cdbc0853365b 100644
--- a/llvm/test/DebugInfo/MIR/X86/dbg-prologue-backup-loc2.mir
+++ b/llvm/test/DebugInfo/MIR/X86/dbg-prologue-backup-loc2.mir
@@ -31,6 +31,13 @@
 # CHECK-NEXT:   .LBB0_1:
 # CHECK-LABEL:  addl    %esi, %edx
 
+## Second function in this file: test that we don't crash when having trailing
+## empty blocks and no location for a prologue. Test that a .loc is produced,
+## with an implicit-not check for there being no prologue_end.
+#
+# CHECK-LABEL: f:
+# CHECK: .loc    0 1234 0
+
  
 --- |
   ; ModuleID = 'out2.ll'
@@ -66,6 +73,17 @@
     ret i32 0, !dbg !17
   }
   
+  define void @f() !dbg !18 {
+  entry:
+    %0 = call ptr @llvm.returnaddress(i32 0)
+    br label %do.body
+
+  do.body:
+    unreachable
+  }
+
+  declare ptr @llvm.returnaddress(i32 immarg)
+
   !llvm.dbg.cu = !{!2}
   !llvm.module.flags = !{!6, !7}
   !llvm.ident = !{!8}
@@ -88,6 +106,7 @@
   !15 = distinct !DILexicalBlock(scope: !9, file: !3, line: 4, column: 15)
   !16 = !DILocation(line: 6, column: 9, scope: !15)
   !17 = !DILocation(line: 8, column: 3, scope: !9)
+  !18 = distinct !DISubprogram(name: "f", scope: !3, file: !3, line: 37, type: !10, scopeLine: 1234, flags: DIFlagPrototyped, spFlags: DISPFlagLocalToUnit | DISPFlagDefinition, unit: !2)
 
 ...
 ---
@@ -132,3 +151,33 @@ body:             |
     RET64 $eax, debug-location !17
 
 ...
+---
+name:            f
+alignment:       16
+tracksRegLiveness: true
+noPhis:          true
+isSSA:           false
+noVRegs:         true
+hasFakeUses:     false
+tracksDebugUserValues: true
+frameInfo:
+  stackSize:       8
+  offsetAdjustment: -8
+  maxAlignment:    1
+  maxCallFrameSize: 0
+  isCalleeSavedInfoValid: true
+fixedStack:
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16 }
+machineFunctionInfo:
+  amxProgModel:    None
+body:             |
+  bb.0.entry:
+    frame-setup PUSH64r killed $rbp, implicit-def $rsp, implicit $rsp
+    frame-setup CFI_INSTRUCTION def_cfa_offset 16
+    frame-setup CFI_INSTRUCTION offset $rbp, -16
+    $rbp = frame-setup MOV64rr $rsp
+    frame-setup CFI_INSTRUCTION def_cfa_register $rbp
+
+  bb.1.do.body:
+
+...

From db6f627f3fd4072fe1814805653a352694527a91 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Tue, 26 Nov 2024 14:26:34 +0000
Subject: [PATCH 21/41] [clang][SME] Ignore flatten/clang::always_inline
 statements for callees with mismatched streaming attributes (#116391)

If `__attribute__((flatten))` is used on a function, or
`[[clang::always_inline]]` on a statement, don't inline any callees with
incompatible streaming attributes. Without this check, clang may produce
incorrect code when these attributes are used in code with streaming
functions.

Note: The docs for flatten say it can be ignored when inlining is
impossible: "causes calls within the attributed function to be inlined
unless it is impossible to do so".

Similarly, the (clang-only) `[[clang::always_inline]]` statement
attribute is more relaxed than the GNU `__attribute__((always_inline))`
(which says it should error it if it can't inline), saying only "If a
statement is marked [[clang::always_inline]] and contains calls, the
compiler attempts to inline those calls.". The docs also go on to show
an example of where `[[clang::always_inline]]` has no effect.
---
 clang/lib/CodeGen/CGCall.cpp                  | 16 ++--
 clang/lib/CodeGen/TargetInfo.h                | 18 ++++
 clang/lib/CodeGen/Targets/AArch64.cpp         | 72 +++++++++++++---
 .../sme-inline-callees-streaming-attrs.c      | 84 +++++++++++++++++++
 4 files changed, 172 insertions(+), 18 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c

diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 35d495d4dfab82..20455dbb820914 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5111,9 +5111,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
 
   // Some architectures (such as x86-64) have the ABI changed based on
   // attribute-target/features. Give them a chance to diagnose.
-  CGM.getTargetCodeGenInfo().checkFunctionCallABI(
-      CGM, Loc, dyn_cast_or_null<FunctionDecl>(CurCodeDecl),
-      dyn_cast_or_null<FunctionDecl>(TargetDecl), CallArgs, RetTy);
+  const FunctionDecl *CallerDecl = dyn_cast_or_null<FunctionDecl>(CurCodeDecl);
+  const FunctionDecl *CalleeDecl = dyn_cast_or_null<FunctionDecl>(TargetDecl);
+  CGM.getTargetCodeGenInfo().checkFunctionCallABI(CGM, Loc, CallerDecl,
+                                                  CalleeDecl, CallArgs, RetTy);
 
   // 1. Set up the arguments.
 
@@ -5688,7 +5689,10 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
     Attrs = Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::NoInline);
 
   // Add call-site always_inline attribute if exists.
-  if (InAlwaysInlineAttributedStmt)
+  // Note: This corresponds to the [[clang::always_inline]] statement attribute.
+  if (InAlwaysInlineAttributedStmt &&
+      !CGM.getTargetCodeGenInfo().wouldInliningViolateFunctionCallABI(
+          CallerDecl, CalleeDecl))
     Attrs =
         Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
 
@@ -5704,7 +5708,9 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
   // FIXME: should this really take priority over __try, below?
   if (CurCodeDecl && CurCodeDecl->hasAttr<FlattenAttr>() &&
       !InNoInlineAttributedStmt &&
-      !(TargetDecl && TargetDecl->hasAttr<NoInlineAttr>())) {
+      !(TargetDecl && TargetDecl->hasAttr<NoInlineAttr>()) &&
+      !CGM.getTargetCodeGenInfo().wouldInliningViolateFunctionCallABI(
+          CallerDecl, CalleeDecl)) {
     Attrs =
         Attrs.addFnAttribute(getLLVMContext(), llvm::Attribute::AlwaysInline);
   }
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 373f8b8a80fdb1..ab3142bdea684e 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -98,6 +98,24 @@ class TargetCodeGenInfo {
                                     const CallArgList &Args,
                                     QualType ReturnType) const {}
 
+  /// Returns true if inlining the function call would produce incorrect code
+  /// for the current target and should be ignored (even with the always_inline
+  /// or flatten attributes).
+  ///
+  /// Note: This probably should be handled in LLVM. However, the LLVM
+  /// `alwaysinline` attribute currently means the inliner will ignore
+  /// mismatched attributes (which sometimes can generate invalid code). So,
+  /// this hook allows targets to avoid adding the LLVM `alwaysinline` attribute
+  /// based on C/C++ attributes or other target-specific reasons.
+  ///
+  /// See previous discussion here:
+  /// https://discourse.llvm.org/t/rfc-avoid-inlining-alwaysinline-functions-when-they-cannot-be-inlined/79528
+  virtual bool
+  wouldInliningViolateFunctionCallABI(const FunctionDecl *Caller,
+                                      const FunctionDecl *Callee) const {
+    return false;
+  }
+
   /// Determines the size of struct _Unwind_Exception on this platform,
   /// in 8-bit units.  The Itanium ABI defines this as:
   ///   struct _Unwind_Exception {
diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
index 9320c6ef06efab..be33e26f047841 100644
--- a/clang/lib/CodeGen/Targets/AArch64.cpp
+++ b/clang/lib/CodeGen/Targets/AArch64.cpp
@@ -177,6 +177,9 @@ class AArch64TargetCodeGenInfo : public TargetCodeGenInfo {
                             const FunctionDecl *Callee, const CallArgList &Args,
                             QualType ReturnType) const override;
 
+  bool wouldInliningViolateFunctionCallABI(
+      const FunctionDecl *Caller, const FunctionDecl *Callee) const override;
+
 private:
   // Diagnose calls between functions with incompatible Streaming SVE
   // attributes.
@@ -1143,12 +1146,22 @@ void AArch64TargetCodeGenInfo::checkFunctionABI(
   }
 }
 
-void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
-    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
-    const FunctionDecl *Callee) const {
-  if (!Caller || !Callee || !Callee->hasAttr<AlwaysInlineAttr>())
-    return;
+enum class ArmSMEInlinability : uint8_t {
+  Ok = 0,
+  ErrorCalleeRequiresNewZA = 1 << 0,
+  WarnIncompatibleStreamingModes = 1 << 1,
+  ErrorIncompatibleStreamingModes = 1 << 2,
+
+  IncompatibleStreamingModes =
+      WarnIncompatibleStreamingModes | ErrorIncompatibleStreamingModes,
 
+  LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/ErrorIncompatibleStreamingModes),
+};
+
+/// Determines if there are any Arm SME ABI issues with inlining \p Callee into
+/// \p Caller. Returns the issue (if any) in the ArmSMEInlinability bit enum.
+static ArmSMEInlinability GetArmSMEInlinability(const FunctionDecl *Caller,
+                                                const FunctionDecl *Callee) {
   bool CallerIsStreaming =
       IsArmStreamingFunction(Caller, /*IncludeLocallyStreaming=*/true);
   bool CalleeIsStreaming =
@@ -1156,17 +1169,44 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
   bool CallerIsStreamingCompatible = isStreamingCompatible(Caller);
   bool CalleeIsStreamingCompatible = isStreamingCompatible(Callee);
 
+  ArmSMEInlinability Inlinability = ArmSMEInlinability::Ok;
+
   if (!CalleeIsStreamingCompatible &&
-      (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible))
-    CGM.getDiags().Report(
-        CallLoc, CalleeIsStreaming
-                     ? diag::err_function_always_inline_attribute_mismatch
-                     : diag::warn_function_always_inline_attribute_mismatch)
-        << Caller->getDeclName() << Callee->getDeclName() << "streaming";
+      (CallerIsStreaming != CalleeIsStreaming || CallerIsStreamingCompatible)) {
+    if (CalleeIsStreaming)
+      Inlinability |= ArmSMEInlinability::ErrorIncompatibleStreamingModes;
+    else
+      Inlinability |= ArmSMEInlinability::WarnIncompatibleStreamingModes;
+  }
   if (auto *NewAttr = Callee->getAttr<ArmNewAttr>())
     if (NewAttr->isNewZA())
-      CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za)
-          << Callee->getDeclName();
+      Inlinability |= ArmSMEInlinability::ErrorCalleeRequiresNewZA;
+
+  return Inlinability;
+}
+
+void AArch64TargetCodeGenInfo::checkFunctionCallABIStreaming(
+    CodeGenModule &CGM, SourceLocation CallLoc, const FunctionDecl *Caller,
+    const FunctionDecl *Callee) const {
+  if (!Caller || !Callee || !Callee->hasAttr<AlwaysInlineAttr>())
+    return;
+
+  ArmSMEInlinability Inlinability = GetArmSMEInlinability(Caller, Callee);
+
+  if ((Inlinability & ArmSMEInlinability::IncompatibleStreamingModes) !=
+      ArmSMEInlinability::Ok)
+    CGM.getDiags().Report(
+        CallLoc,
+        (Inlinability & ArmSMEInlinability::ErrorIncompatibleStreamingModes) ==
+                ArmSMEInlinability::ErrorIncompatibleStreamingModes
+            ? diag::err_function_always_inline_attribute_mismatch
+            : diag::warn_function_always_inline_attribute_mismatch)
+        << Caller->getDeclName() << Callee->getDeclName() << "streaming";
+
+  if ((Inlinability & ArmSMEInlinability::ErrorCalleeRequiresNewZA) ==
+      ArmSMEInlinability::ErrorCalleeRequiresNewZA)
+    CGM.getDiags().Report(CallLoc, diag::err_function_always_inline_new_za)
+        << Callee->getDeclName();
 }
 
 // If the target does not have floating-point registers, but we are using a
@@ -1200,6 +1240,12 @@ void AArch64TargetCodeGenInfo::checkFunctionCallABI(CodeGenModule &CGM,
   checkFunctionCallABISoftFloat(CGM, CallLoc, Caller, Callee, Args, ReturnType);
 }
 
+bool AArch64TargetCodeGenInfo::wouldInliningViolateFunctionCallABI(
+    const FunctionDecl *Caller, const FunctionDecl *Callee) const {
+  return Caller && Callee &&
+         GetArmSMEInlinability(Caller, Callee) != ArmSMEInlinability::Ok;
+}
+
 void AArch64ABIInfo::appendAttributeMangling(TargetClonesAttr *Attr,
                                              unsigned Index,
                                              raw_ostream &Out) const {
diff --git a/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c b/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c
new file mode 100644
index 00000000000000..ce6f203631fc5c
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/sme-inline-callees-streaming-attrs.c
@@ -0,0 +1,84 @@
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme %s -DUSE_FLATTEN -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -emit-llvm -target-feature +sme %s -DUSE_ALWAYS_INLINE_STMT -o - | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+extern void was_inlined(void);
+
+#if defined(USE_FLATTEN)
+    #define FN_ATTR __attribute__((flatten))
+    #define STMT_ATTR
+#elif defined(USE_ALWAYS_INLINE_STMT)
+    #define FN_ATTR
+    #define STMT_ATTR [[clang::always_inline]]
+#else
+    #error Expected USE_FLATTEN or USE_ALWAYS_INLINE_STMT to be defined.
+#endif
+
+void fn(void) { was_inlined(); }
+void fn_streaming_compatible(void) __arm_streaming_compatible { was_inlined(); }
+void fn_streaming(void) __arm_streaming { was_inlined(); }
+__arm_locally_streaming void fn_locally_streaming(void) { was_inlined(); }
+__arm_new("za") void fn_streaming_new_za(void) __arm_streaming { was_inlined(); }
+
+FN_ATTR
+void caller(void) {
+    STMT_ATTR fn();
+    STMT_ATTR fn_streaming_compatible();
+    STMT_ATTR fn_streaming();
+    STMT_ATTR fn_locally_streaming();
+    STMT_ATTR fn_streaming_new_za();
+}
+// CHECK-LABEL: void @caller()
+//  CHECK-NEXT: entry:
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @fn_streaming
+//  CHECK-NEXT:   call void @fn_locally_streaming
+//  CHECK-NEXT:   call void @fn_streaming_new_za
+
+FN_ATTR void caller_streaming_compatible(void) __arm_streaming_compatible {
+    STMT_ATTR fn();
+    STMT_ATTR fn_streaming_compatible();
+    STMT_ATTR fn_streaming();
+    STMT_ATTR fn_locally_streaming();
+    STMT_ATTR fn_streaming_new_za();
+}
+// CHECK-LABEL: void @caller_streaming_compatible()
+//  CHECK-NEXT: entry:
+//  CHECK-NEXT:   call void @fn
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @fn_streaming
+//  CHECK-NEXT:   call void @fn_locally_streaming
+//  CHECK-NEXT:   call void @fn_streaming_new_za
+
+FN_ATTR void caller_streaming(void) __arm_streaming {
+    STMT_ATTR fn();
+    STMT_ATTR fn_streaming_compatible();
+    STMT_ATTR fn_streaming();
+    STMT_ATTR fn_locally_streaming();
+    STMT_ATTR fn_streaming_new_za();
+}
+// CHECK-LABEL: void @caller_streaming()
+//  CHECK-NEXT: entry:
+//  CHECK-NEXT:   call void @fn
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @fn_streaming_new_za
+
+FN_ATTR __arm_locally_streaming
+void caller_locally_streaming(void) {
+    STMT_ATTR fn();
+    STMT_ATTR fn_streaming_compatible();
+    STMT_ATTR fn_streaming();
+    STMT_ATTR fn_locally_streaming();
+    STMT_ATTR fn_streaming_new_za();
+}
+// CHECK-LABEL: void @caller_locally_streaming()
+//  CHECK-NEXT: entry:
+//  CHECK-NEXT:   call void @fn
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @was_inlined
+//  CHECK-NEXT:   call void @fn_streaming_new_za

From b214ca82daeece1568268ebc0fbcc2eaa649425b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= <andrzej.warzynski@arm.com>
Date: Tue, 26 Nov 2024 14:59:39 +0000
Subject: [PATCH 22/41] [mlir][vector] Rename vector type TD definitions (nfc)
 (#117150)

Currently, the Vector dialect TD file includes the following "vector"
type definitions:

```mlir
def AnyVector : VectorOf<[AnyType]>;
def AnyVectorOfAnyRank : VectorOfAnyRankOf<[AnyType]>;
def AnyFixedVector : FixedVectorOf<[AnyType]>;
def AnyScalableVector : ScalableVectorOf<[AnyType]>;
```

In short:

  * `AnyVector` _excludes_ 0-D vectors.
  * `AnyVectorOfAnyRank`, `AnyFixedVector`, and `AnyScalableVector`
    _include_ 0-D vectors.

The naming for "groups" that include 0-D vectors is inconsistent and can
be misleading, and `AnyVector` implies that 0-D vectors are included,
which is not the case.

This patch renames these definitions for clarity:

```mlir
def AnyVectorOfNonZeroRank : VectorOfNonZeroRankOf<[AnyType]>;
def AnyVectorOfAnyRank : VectorOfAnyRankOf<[AnyType]>;
def AnyFixedVectorOfAnyRank : FixedVectorOfAnyRank<[AnyType]>;
def AnyScalableVectorOfAnyRank : ScalableVectorOfAnyRank<[AnyType]>;
```

Rationale:
* The updated names are more explicit about 0-D vector support.
* It becomes clearer that scalable vectors currently allow 0-D vectors -
  this might warrant a revisit.
* The renaming paves the way for adding a new group for "fixed-width
  vectors excluding 0-D vectors" (e.g., AnyFixedVector), which I plan to
  introduce in a follow-up patch.
---
 .../mlir/Dialect/Affine/IR/AffineOps.td       |  4 +-
 mlir/include/mlir/Dialect/ArmNeon/ArmNeon.td  |  2 +-
 .../mlir/Dialect/ArmSME/IR/ArmSMEOps.td       | 10 +--
 mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td | 54 ++++++------
 mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td   | 22 ++---
 .../mlir/Dialect/Tosa/IR/TosaTypesBase.td     |  2 +-
 .../mlir/Dialect/Vector/IR/VectorOps.td       | 84 ++++++++++---------
 mlir/include/mlir/IR/CommonTypeConstraints.td | 59 ++++++-------
 mlir/test/lib/Dialect/Test/TestOps.td         |  2 +-
 9 files changed, 121 insertions(+), 118 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index 76d97f106dcb88..56fbe9cdc2d21d 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -964,7 +964,7 @@ def AffineVectorLoadOp : AffineLoadOpBase<"vector_load"> {
     (see [vector.transfer_read](../Vector/#vectortransfer_read-mlirvectortransferreadop)).
   }];
 
-  let results = (outs AnyVector:$result);
+  let results = (outs AnyVectorOfNonZeroRank:$result);
 
   let builders = [
     /// Builds an affine vector load op with the specified map and operands.
@@ -1031,7 +1031,7 @@ def AffineVectorStoreOp : AffineStoreOpBase<"vector_store"> {
     (see [vector.transfer_write](../Vector/#vectortransfer_write-mlirvectortransferwriteop)).
   }];
 
-  let arguments = (ins AnyVector:$value,
+  let arguments = (ins AnyVectorOfNonZeroRank:$value,
       Arg<AnyMemRef, "the reference to store to",
       [MemWrite]>:$memref,
       Variadic<Index>:$indices,
diff --git a/mlir/include/mlir/Dialect/ArmNeon/ArmNeon.td b/mlir/include/mlir/Dialect/ArmNeon/ArmNeon.td
index 9cc792093bf836..475b11f12c5f01 100644
--- a/mlir/include/mlir/Dialect/ArmNeon/ArmNeon.td
+++ b/mlir/include/mlir/Dialect/ArmNeon/ArmNeon.td
@@ -35,7 +35,7 @@ def ArmNeon_Dialect : Dialect {
 //===----------------------------------------------------------------------===//
 
 class NeonVectorOfLength<int length, Type elementType> : ShapedContainerType<
-  [elementType], And<[IsVectorOfShape<[length]>, IsFixedVectorTypePred]>,
+  [elementType], And<[IsVectorOfShape<[length]>, IsFixedVectorOfAnyRankTypePred]>,
   "a vector with length " # length,
   "::mlir::VectorType">;
 
diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
index 9a058ae4fe7647..6fd992afbf0436 100644
--- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
+++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEOps.td
@@ -371,7 +371,7 @@ def TileLoadOp : ArmSME_Op<"tile_load", [
   let arguments = (ins
     Arg<AnyMemRef, "the reference to load from", [MemRead]>:$base,
     Variadic<Index>:$indices,
-    Optional<AnyType>:$padding, Optional<AnyVector>:$mask,
+    Optional<AnyType>:$padding, Optional<AnyVectorOfNonZeroRank>:$mask,
     ArmSME_TileSliceLayoutAttr:$layout
   );
   let results = (outs SMETile:$result);
@@ -444,7 +444,7 @@ def TileStoreOp : ArmSME_Op<"tile_store", [
   }];
   let arguments = (ins SMETile:$valueToStore,
     Arg<AnyMemRef, "the reference to store to", [MemWrite]>:$base,
-    Variadic<Index>:$indices, Optional<AnyVector>:$mask,
+    Variadic<Index>:$indices, Optional<AnyVectorOfNonZeroRank>:$mask,
     ArmSME_TileSliceLayoutAttr:$layout
   );
   let extraClassDeclaration = [{
@@ -799,9 +799,9 @@ class OuterProductWideningBase<string mnemonic,
   ]> {
 
   let arguments = (ins
-    AnyTypeOf<allowedInputVectorTypes>:$lhs, AnyVector:$rhs,
-    Optional<AnyVector>:$lhsMask, Optional<AnyVector>:$rhsMask,
-    Optional<AnyVector>:$acc);
+    AnyTypeOf<allowedInputVectorTypes>:$lhs, AnyVectorOfNonZeroRank:$rhs,
+    Optional<AnyVectorOfNonZeroRank>:$lhsMask, Optional<AnyVectorOfNonZeroRank>:$rhsMask,
+    Optional<AnyVectorOfNonZeroRank>:$acc);
   let results = (outs AnyTypeOf<allowedResultVectorTypes>:$result);
 
   let assemblyFormat = [{
diff --git a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
index d7e8b22fbd2d35..cdcf4d8752e874 100644
--- a/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
+++ b/mlir/include/mlir/Dialect/ArmSVE/IR/ArmSVE.td
@@ -100,11 +100,11 @@ class ScalableMaskedFOp<string mnemonic, string op_description,
     op_description # [{ on active lanes. Inactive lanes will keep the value of
     the first operand.}];
   let arguments = (ins
-          ScalableVectorOf<[I1]>:$mask,
-          ScalableVectorOf<[AnyFloat]>:$src1,
-          ScalableVectorOf<[AnyFloat]>:$src2
+          ScalableVectorOfAnyRank<[I1]>:$mask,
+          ScalableVectorOfAnyRank<[AnyFloat]>:$src1,
+          ScalableVectorOfAnyRank<[AnyFloat]>:$src2
   );
-  let results = (outs ScalableVectorOf<[AnyFloat]>:$res);
+  let results = (outs ScalableVectorOfAnyRank<[AnyFloat]>:$res);
   let assemblyFormat =
     "$mask `,` $src1 `,` $src2 attr-dict `:` type($mask) `,` type($res)";
 }
@@ -123,11 +123,11 @@ class ScalableMaskedIOp<string mnemonic, string op_description,
     op_description # [{ on active lanes. Inactive lanes will keep the value of
     the first operand.}];
   let arguments = (ins
-          ScalableVectorOf<[I1]>:$mask,
-          ScalableVectorOf<[I8, I16, I32, I64]>:$src1,
-          ScalableVectorOf<[I8, I16, I32, I64]>:$src2
+          ScalableVectorOfAnyRank<[I1]>:$mask,
+          ScalableVectorOfAnyRank<[I8, I16, I32, I64]>:$src1,
+          ScalableVectorOfAnyRank<[I8, I16, I32, I64]>:$src2
   );
-  let results = (outs ScalableVectorOf<[I8, I16, I32, I64]>:$res);
+  let results = (outs ScalableVectorOfAnyRank<[I8, I16, I32, I64]>:$res);
   let assemblyFormat =
     "$mask `,` $src1 `,` $src2 attr-dict `:` type($mask) `,` type($res)";
 }
@@ -511,55 +511,55 @@ def ScalableMaskedDivFOp : ScalableMaskedFOp<"masked.divf", "division">;
 
 def UmmlaIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"ummla">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def SmmlaIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"smmla">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def SdotIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"sdot">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def UdotIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"udot">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedAddIIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"add">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedAddFIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"fadd">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedMulIIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"mul">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedMulFIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"fmul">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedSubIIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"sub">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedSubFIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"fsub">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedSDivIIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"sdiv">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedUDivIIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"udiv">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ScalableMaskedDivFIntrOp :
   ArmSVE_IntrBinaryOverloadedOp<"fdiv">,
-  Arguments<(ins AnyScalableVector, AnyScalableVector, AnyScalableVector)>;
+  Arguments<(ins AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank, AnyScalableVectorOfAnyRank)>;
 
 def ConvertFromSvboolIntrOp :
   ArmSVE_IntrOp<"convert.from.svbool",
@@ -581,8 +581,8 @@ def ZipX2IntrOp : ArmSVE_IntrOp<"zip.x2",
     /*overloadedOperands=*/[0],
     /*overloadedResults=*/[],
     /*numResults=*/2>,
-    Arguments<(ins Arg<AnyScalableVector, "v1">:$v1,
-                   Arg<AnyScalableVector, "v2">:$v2)>;
+    Arguments<(ins Arg<AnyScalableVectorOfAnyRank, "v1">:$v1,
+                   Arg<AnyScalableVectorOfAnyRank, "v2">:$v2)>;
 
 // Note: This multi-vector intrinsic requires SME2.
 def ZipX4IntrOp : ArmSVE_IntrOp<"zip.x4",
@@ -590,10 +590,10 @@ def ZipX4IntrOp : ArmSVE_IntrOp<"zip.x4",
     /*overloadedOperands=*/[0],
     /*overloadedResults=*/[],
     /*numResults=*/4>,
-    Arguments<(ins Arg<AnyScalableVector, "v1">:$v1,
-                   Arg<AnyScalableVector, "v2">:$v2,
-                   Arg<AnyScalableVector, "v3">:$v3,
-                   Arg<AnyScalableVector, "v3">:$v4)>;
+    Arguments<(ins Arg<AnyScalableVectorOfAnyRank, "v1">:$v1,
+                   Arg<AnyScalableVectorOfAnyRank, "v2">:$v2,
+                   Arg<AnyScalableVectorOfAnyRank, "v3">:$v3,
+                   Arg<AnyScalableVectorOfAnyRank, "v3">:$v4)>;
 
 // Note: This intrinsic requires SME or SVE2.1.
 def PselIntrOp : ArmSVE_IntrOp<"psel",
diff --git a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
index 1f52f6b91617c1..b39f2ee594cd4a 100644
--- a/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
+++ b/mlir/include/mlir/Dialect/NVGPU/IR/NVGPU.td
@@ -255,7 +255,7 @@ def NVGPU_LdMatrixOp : NVGPU_Op<"ldmatrix", [
   let arguments = (ins Arg<AnyMemRef, "", [MemReadAt<0, FullEffect>]>:$srcMemref,
                            Variadic<Index>:$indices, BoolAttr:$transpose,
                            I32Attr:$numTiles);
-  let results = (outs AnyVector:$res);
+  let results = (outs AnyVectorOfNonZeroRank:$res);
   let assemblyFormat = [{
     $srcMemref`[` $indices `]` attr-dict `:` type($srcMemref) `->` type($res)
   }];
@@ -301,13 +301,13 @@ def NVGPU_MmaSyncOp : NVGPU_MmaSyncOp<"mma.sync"> {
         (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf32>) -> vector<2x2xf32>
     ```
   }];
-  let arguments = (ins AnyVector:$matrixA,
-                       AnyVector:$matrixB,
-                       AnyVector:$matrixC,
+  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+                       AnyVectorOfNonZeroRank:$matrixB,
+                       AnyVectorOfNonZeroRank:$matrixC,
                        I64ArrayAttr:$mmaShape,
                        OptionalAttr<UnitAttr>:$tf32Enabled);
 
-  let results = (outs AnyVector:$res);
+  let results = (outs AnyVectorOfNonZeroRank:$res);
 
   let builders = [
     OpBuilder<(ins "Value":$matrixA,
@@ -357,16 +357,16 @@ def NVGPU_MmaSparseSyncOp : NVGPU_MmaSyncOp<"mma.sp.sync"> {
   ```
   }];
 
-  let arguments = (ins AnyVector:$matrixA,
-                       AnyVector:$matrixB,
-                       AnyVector:$matrixC,
+  let arguments = (ins AnyVectorOfNonZeroRank:$matrixA,
+                       AnyVectorOfNonZeroRank:$matrixB,
+                       AnyVectorOfNonZeroRank:$matrixC,
                        NVGPU_MmaSparseSyncMetadataType:$sparseMetadata,
                        I64ArrayAttr:$mmaShape,
                        DefaultValuedAttr<I32Attr, "0">:$sparsitySelector,
                        OptionalAttr<UnitAttr>:$tf32Enabled
                        );
 
-  let results = (outs AnyVector:$res);
+  let results = (outs AnyVectorOfNonZeroRank:$res);
 
   let builders = [
     OpBuilder<(ins "Value":$matrixA,
@@ -825,10 +825,10 @@ def NVGPU_RcpOp : NVGPU_Op<"rcp", [Pure,
 
     The input and output must be of the same vector type and shape.
   }];
-  let arguments = (ins VectorOf<[F32]>:$in,
+  let arguments = (ins VectorOfNonZeroRankOf<[F32]>:$in,
                        DefaultValuedAttr<RcpRoundingModeAttr, "RcpRoundingMode::APPROX">:$rounding,
                        UnitAttr:$ftz);
-  let results = (outs VectorOf<[F32]>:$out);
+  let results = (outs VectorOfNonZeroRankOf<[F32]>:$out);
   let assemblyFormat = [{
     $in `{` `rounding` `=` $rounding (`,` `ftz` $ftz^)? `}` 
     attr-dict `:` type($out)
diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
index a4b43d656fe43e..a6d3163d4446fa 100644
--- a/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
+++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaTypesBase.td
@@ -166,7 +166,7 @@ def Tosa_Int32TensorUpto4D : AnyTypeOf<[
 
 class Tosa_TypeLike<list<Type> types, string description = ""> : TypeConstraint<Or<[
      AnyTypeOf<types>.predicate,
-     VectorOf<types>.predicate,
+     VectorOfNonZeroRankOf<types>.predicate,
      TosaTensorOf<types>.predicate]>,
      description>;
 
diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 41d7ce6610085c..88c1b94412241e 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -40,7 +40,7 @@ def Vector_ContractionOp :
       DeclareOpInterfaceMethods<MaskableOpInterface>,
       DeclareOpInterfaceMethods<VectorUnrollOpInterface, ["getShapeForUnroll"]>
     ]>,
-    Arguments<(ins AnyVector:$lhs, AnyVector:$rhs, AnyType:$acc,
+    Arguments<(ins AnyVectorOfNonZeroRank:$lhs, AnyVectorOfNonZeroRank:$rhs, AnyType:$acc,
                ArrayAttr:$indexing_maps,
                Vector_IteratorTypeArrayAttr:$iterator_types,
                DefaultValuedAttr<Vector_CombiningKindAttr,
@@ -285,7 +285,7 @@ def Vector_MultiDimReductionOp :
      DeclareOpInterfaceMethods<VectorUnrollOpInterface,
                                ["getShapeForUnroll"]>]>,
     Arguments<(ins Vector_CombiningKindAttr:$kind,
-                   AnyVector:$source,
+                   AnyVectorOfNonZeroRank:$source,
                    AnyType:$acc,
                    DenseI64ArrayAttr:$reduction_dims)>,
     Results<(outs AnyType:$dest)> {
@@ -417,16 +417,18 @@ def Vector_BroadcastOp :
   let hasVerifier = 1;
 }
 
-def Vector_ShuffleOp :
-  Vector_Op<"shuffle", [Pure,
-     PredOpTrait<"first operand v1 and result have same element type",
-                 TCresVTEtIsSameAsOpBase<0, 0>>,
-     PredOpTrait<"second operand v2 and result have same element type",
-                 TCresVTEtIsSameAsOpBase<0, 1>>,
-     InferTypeOpAdaptor]>,
-     Arguments<(ins AnyFixedVector:$v1, AnyFixedVector:$v2,
-                    DenseI64ArrayAttr:$mask)>,
-     Results<(outs AnyVector:$vector)> {
+def Vector_ShuffleOp
+    : Vector_Op<
+          "shuffle",
+          [Pure,
+           PredOpTrait<"first operand v1 and result have same element type",
+                       TCresVTEtIsSameAsOpBase<0, 0>>,
+           PredOpTrait<"second operand v2 and result have same element type",
+                       TCresVTEtIsSameAsOpBase<0, 1>>,
+           InferTypeOpAdaptor]>,
+      Arguments<(ins AnyFixedVectorOfAnyRank:$v1, AnyFixedVectorOfAnyRank:$v2,
+          DenseI64ArrayAttr:$mask)>,
+      Results<(outs AnyVectorOfNonZeroRank:$vector)> {
   let summary = "shuffle operation";
   let description = [{
     The shuffle operation constructs a permutation (or duplication) of elements
@@ -531,7 +533,7 @@ def Vector_InterleaveOp :
   }];
 
   let arguments = (ins AnyVectorOfAnyRank:$lhs, AnyVectorOfAnyRank:$rhs);
-  let results = (outs AnyVector:$result);
+  let results = (outs AnyVectorOfNonZeroRank:$result);
 
   let assemblyFormat = [{
     $lhs `,` $rhs  attr-dict `:` type($lhs) `->` type($result)
@@ -610,8 +612,8 @@ def Vector_DeinterleaveOp :
         ```
       }];
 
-      let arguments = (ins AnyVector:$source);
-      let results = (outs AnyVector:$res1, AnyVector:$res2);
+      let arguments = (ins AnyVectorOfNonZeroRank:$source);
+      let results = (outs AnyVectorOfNonZeroRank:$res1, AnyVectorOfNonZeroRank:$res2);
 
       let assemblyFormat = [{
         $source attr-dict `:` type($source) `->` type($res1)
@@ -1048,9 +1050,9 @@ def Vector_InsertStridedSliceOp :
     PredOpTrait<"operand #0 and result have same element type",
                  TCresVTEtIsSameAsOpBase<0, 0>>,
     AllTypesMatch<["dest", "res"]>]>,
-    Arguments<(ins AnyVector:$source, AnyVector:$dest, I64ArrayAttr:$offsets,
+    Arguments<(ins AnyVectorOfNonZeroRank:$source, AnyVectorOfNonZeroRank:$dest, I64ArrayAttr:$offsets,
                I64ArrayAttr:$strides)>,
-    Results<(outs AnyVector:$res)> {
+    Results<(outs AnyVectorOfNonZeroRank:$res)> {
   let summary = "strided_slice operation";
   let description = [{
     Takes a k-D source vector, an n-D destination vector (n >= k), n-sized
@@ -1107,10 +1109,10 @@ def Vector_OuterProductOp :
     PredOpTrait<"rhs operand and result have same element type",
                 TCresVTEtIsSameAsOpBase<0, 1>>,
     DeclareOpInterfaceMethods<MaskableOpInterface>]>,
-    Arguments<(ins AnyVector:$lhs, AnyType:$rhs,
-               Optional<AnyVector>:$acc,
+    Arguments<(ins AnyVectorOfNonZeroRank:$lhs, AnyType:$rhs,
+               Optional<AnyVectorOfNonZeroRank>:$acc,
                DefaultValuedAttr<Vector_CombiningKindAttr, "CombiningKind::ADD">:$kind)>,
-    Results<(outs AnyVector)> {
+    Results<(outs AnyVectorOfNonZeroRank)> {
   let summary = "vector outerproduct with optional fused add";
   let description = [{
     Takes 2 1-D vectors and returns the 2-D vector containing the outer-product,
@@ -1190,9 +1192,9 @@ def Vector_ExtractStridedSliceOp :
   Vector_Op<"extract_strided_slice", [Pure,
     PredOpTrait<"operand and result have same element type",
                  TCresVTEtIsSameAsOpBase<0, 0>>]>,
-    Arguments<(ins AnyVector:$vector, I64ArrayAttr:$offsets,
+    Arguments<(ins AnyVectorOfNonZeroRank:$vector, I64ArrayAttr:$offsets,
                I64ArrayAttr:$sizes, I64ArrayAttr:$strides)>,
-    Results<(outs AnyVector)> {
+    Results<(outs AnyVectorOfNonZeroRank)> {
   let summary = "extract_strided_slice operation";
   let description = [{
     Takes an n-D vector, k-D `offsets` integer array attribute, a k-sized
@@ -1254,7 +1256,7 @@ def Vector_TransferReadOp :
                    Variadic<Index>:$indices,
                    AffineMapAttr:$permutation_map,
                    AnyType:$padding,
-                   Optional<VectorOf<[I1]>>:$mask,
+                   Optional<VectorOfNonZeroRankOf<[I1]>>:$mask,
                    BoolArrayAttr:$in_bounds)>,
     Results<(outs AnyVectorOfAnyRank:$vector)> {
 
@@ -1502,7 +1504,7 @@ def Vector_TransferWriteOp :
                    AnyShaped:$source,
                    Variadic<Index>:$indices,
                    AffineMapAttr:$permutation_map,
-                   Optional<VectorOf<[I1]>>:$mask,
+                   Optional<VectorOfNonZeroRankOf<[I1]>>:$mask,
                    BoolArrayAttr:$in_bounds)>,
     Results<(outs Optional<AnyRankedTensor>:$result)> {
 
@@ -1825,9 +1827,9 @@ def Vector_MaskedLoadOp :
   Vector_Op<"maskedload">,
     Arguments<(ins Arg<AnyMemRef, "", [MemRead]>:$base,
                Variadic<Index>:$indices,
-               VectorOf<[I1]>:$mask,
-               AnyVector:$pass_thru)>,
-    Results<(outs AnyVector:$result)> {
+               VectorOfNonZeroRankOf<[I1]>:$mask,
+               AnyVectorOfNonZeroRank:$pass_thru)>,
+    Results<(outs AnyVectorOfNonZeroRank:$result)> {
 
   let summary = "loads elements from memory into a vector as defined by a mask vector";
 
@@ -1888,8 +1890,8 @@ def Vector_MaskedStoreOp :
   Vector_Op<"maskedstore">,
     Arguments<(ins Arg<AnyMemRef, "", [MemWrite]>:$base,
                Variadic<Index>:$indices,
-               VectorOf<[I1]>:$mask,
-               AnyVector:$valueToStore)> {
+               VectorOfNonZeroRankOf<[I1]>:$mask,
+               AnyVectorOfNonZeroRank:$valueToStore)> {
 
   let summary = "stores elements from a vector into memory as defined by a mask vector";
 
@@ -1951,10 +1953,10 @@ def Vector_GatherOp :
   ]>,
     Arguments<(ins Arg<AnyShaped, "", [MemRead]>:$base,
                Variadic<Index>:$indices,
-               VectorOf<[AnyInteger, Index]>:$index_vec,
-               VectorOf<[I1]>:$mask,
-               AnyVector:$pass_thru)>,
-    Results<(outs AnyVector:$result)> {
+               VectorOfNonZeroRankOf<[AnyInteger, Index]>:$index_vec,
+               VectorOfNonZeroRankOf<[I1]>:$mask,
+               AnyVectorOfNonZeroRank:$pass_thru)>,
+    Results<(outs AnyVectorOfNonZeroRank:$result)> {
 
   let summary = [{
     gathers elements from memory or ranked tensor into a vector as defined by an
@@ -2082,9 +2084,9 @@ def Vector_ExpandLoadOp :
   Vector_Op<"expandload">,
     Arguments<(ins Arg<AnyMemRef, "", [MemRead]>:$base,
                Variadic<Index>:$indices,
-               VectorOf<[I1]>:$mask,
-               AnyVector:$pass_thru)>,
-    Results<(outs AnyVector:$result)> {
+               VectorOfNonZeroRankOf<[I1]>:$mask,
+               AnyVectorOfNonZeroRank:$pass_thru)>,
+    Results<(outs AnyVectorOfNonZeroRank:$result)> {
 
   let summary = "reads elements from memory and spreads them into a vector as defined by a mask";
 
@@ -2149,8 +2151,8 @@ def Vector_CompressStoreOp :
   Vector_Op<"compressstore">,
     Arguments<(ins Arg<AnyMemRef, "", [MemWrite]>:$base,
                Variadic<Index>:$indices,
-               VectorOf<[I1]>:$mask,
-               AnyVector:$valueToStore)> {
+               VectorOfNonZeroRankOf<[I1]>:$mask,
+               AnyVectorOfNonZeroRank:$valueToStore)> {
 
   let summary = "writes elements selectively from a vector as defined by a mask";
 
@@ -2508,7 +2510,7 @@ def Vector_MaskOp : Vector_Op<"mask", [
   }];
 
   // TODO: Support multiple passthru values.
-  let arguments = (ins VectorOf<[I1]>:$mask,
+  let arguments = (ins VectorOfNonZeroRankOf<[I1]>:$mask,
                    Optional<AnyType>:$passthru);
   let results = (outs Variadic<AnyType>:$results);
   let regions = (region SizedRegion<1>:$maskRegion);
@@ -2891,11 +2893,11 @@ def Vector_ScanOp :
     AllTypesMatch<["source", "dest"]>,
     AllTypesMatch<["initial_value", "accumulated_value"]> ]>,
     Arguments<(ins Vector_CombiningKindAttr:$kind,
-                   AnyVector:$source,
+                   AnyVectorOfNonZeroRank:$source,
                    AnyVectorOfAnyRank:$initial_value,
                    I64Attr:$reduction_dim,
                    BoolAttr:$inclusive)>,
-    Results<(outs AnyVector:$dest,
+    Results<(outs AnyVectorOfNonZeroRank:$dest,
                   AnyVectorOfAnyRank:$accumulated_value)> {
   let summary = "Scan operation";
   let description = [{
diff --git a/mlir/include/mlir/IR/CommonTypeConstraints.td b/mlir/include/mlir/IR/CommonTypeConstraints.td
index 48e4c24f838652..fc4383d08422cb 100644
--- a/mlir/include/mlir/IR/CommonTypeConstraints.td
+++ b/mlir/include/mlir/IR/CommonTypeConstraints.td
@@ -22,15 +22,15 @@ include "mlir/IR/DialectBase.td"
 
 // Whether a type is a VectorType.
 // Explicitly disallow 0-D vectors for now until we have good enough coverage.
-def IsVectorTypePred : And<[CPred<"::llvm::isa<::mlir::VectorType>($_self)">,
-                            CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">]>;
+def IsVectorOfNonZeroRankTypePred : And<[CPred<"::llvm::isa<::mlir::VectorType>($_self)">,
+                                         CPred<"::llvm::cast<::mlir::VectorType>($_self).getRank() > 0">]>;
 
 // Temporary vector type clone that allows gradual transition to 0-D vectors.
 // TODO: Remove this when all ops support 0-D vectors.
 def IsVectorOfAnyRankTypePred : CPred<"::llvm::isa<::mlir::VectorType>($_self)">;
 
 // Whether a type is a fixed-length VectorType.
-def IsFixedVectorTypePred : CPred<[{::llvm::isa<::mlir::VectorType>($_self) &&
+def IsFixedVectorOfAnyRankTypePred : CPred<[{::llvm::isa<::mlir::VectorType>($_self) &&
                                   !::llvm::cast<VectorType>($_self).isScalable()}]>;
 
 // Whether a type is a scalable VectorType.
@@ -53,7 +53,7 @@ def IsVectorTypeWithOnlyTrailingDimScalablePred : And<[
 
 // Whether a type is a VectorType and all dimensions are scalable.
 def IsVectorTypeWithAllDimsScalablePred : And<[
-  IsVectorTypePred,
+  IsVectorOfNonZeroRankTypePred,
   CPred<[{::llvm::cast<::mlir::VectorType>($_self).allDimsScalable()}]>
 ]>;
 
@@ -428,8 +428,8 @@ class ValueSemanticsContainerOf<list<Type> allowedTypes> :
 
 // Vector types.
 
-class VectorOf<list<Type> allowedTypes> :
-  ShapedContainerType<allowedTypes, IsVectorTypePred, "vector",
+class VectorOfNonZeroRankOf<list<Type> allowedTypes> :
+  ShapedContainerType<allowedTypes, IsVectorOfNonZeroRankTypePred, "vector",
                       "::mlir::VectorType">;
 
 // Temporary vector type clone that allows gradual transition to 0-D vectors.
@@ -438,11 +438,11 @@ class VectorOfAnyRankOf<list<Type> allowedTypes> :
   ShapedContainerType<allowedTypes, IsVectorOfAnyRankTypePred, "vector",
                       "::mlir::VectorType">;
 
-class FixedVectorOf<list<Type> allowedTypes> :
-  ShapedContainerType<allowedTypes, IsFixedVectorTypePred,
+class FixedVectorOfAnyRank<list<Type> allowedTypes> :
+  ShapedContainerType<allowedTypes, IsFixedVectorOfAnyRankTypePred,
           "fixed-length vector", "::mlir::VectorType">;
 
-class ScalableVectorOf<list<Type> allowedTypes> :
+class ScalableVectorOfAnyRank<list<Type> allowedTypes> :
   ShapedContainerType<allowedTypes, IsVectorTypeWithAnyDimScalablePred,
           "scalable vector", "::mlir::VectorType">;
 
@@ -458,7 +458,7 @@ class VectorWithTrailingDimScalableOf<list<Type> allowedTypes> :
 // Whether the number of elements of a vector is from the given
 // `allowedRanks` list
 class IsVectorOfRankPred<list<int> allowedRanks> :
-  And<[IsVectorTypePred,
+  And<[IsVectorOfNonZeroRankTypePred,
        Or<!foreach(allowedlength, allowedRanks,
                    CPred<[{::llvm::cast<::mlir::VectorType>($_self).getRank()
                            == }]
@@ -467,7 +467,7 @@ class IsVectorOfRankPred<list<int> allowedRanks> :
 // Whether the number of elements of a fixed-length vector is from the given
 // `allowedRanks` list
 class IsFixedVectorOfRankPred<list<int> allowedRanks> :
-  And<[IsFixedVectorTypePred,
+  And<[IsFixedVectorOfAnyRankTypePred,
        Or<!foreach(allowedlength, allowedRanks,
                    CPred<[{::llvm::cast<::mlir::VectorType>($_self).getRank()
                            == }]
@@ -501,22 +501,22 @@ class ScalableVectorOfRank<list<int> allowedRanks> : Type<
 // is from the given `allowedTypes` list
 class VectorOfRankAndType<list<int> allowedRanks,
                           list<Type> allowedTypes> : AllOfType<
-  [VectorOf<allowedTypes>, VectorOfRank<allowedRanks>],
-  VectorOf<allowedTypes>.summary # VectorOfRank<allowedRanks>.summary,
+  [VectorOfNonZeroRankOf<allowedTypes>, VectorOfRank<allowedRanks>],
+   VectorOfNonZeroRankOf<allowedTypes>.summary # VectorOfRank<allowedRanks>.summary,
   "::mlir::VectorType">;
 
 // Fixed-width vector where the rank is from the given `allowedRanks` list and
 // the type is from the given `allowedTypes` list
 class FixedVectorOfRankAndType<list<int> allowedRanks,
                           list<Type> allowedTypes> : AllOfType<
-  [FixedVectorOf<allowedTypes>, VectorOfRank<allowedRanks>],
-  FixedVectorOf<allowedTypes>.summary # VectorOfRank<allowedRanks>.summary,
+  [FixedVectorOfAnyRank<allowedTypes>, VectorOfRank<allowedRanks>],
+  FixedVectorOfAnyRank<allowedTypes>.summary # VectorOfRank<allowedRanks>.summary,
   "::mlir::VectorType">;
 
 // Whether the number of elements of a vector is from the given
 // `allowedLengths` list
 class IsVectorOfLengthPred<list<int> allowedLengths> :
-  And<[IsVectorTypePred,
+  And<[IsVectorOfNonZeroRankTypePred,
        Or<!foreach(allowedlength, allowedLengths,
                    CPred<[{::llvm::cast<::mlir::VectorType>($_self).getNumElements()
                            == }]
@@ -525,7 +525,7 @@ class IsVectorOfLengthPred<list<int> allowedLengths> :
 // Whether the number of elements of a fixed-length vector is from the given
 // `allowedLengths` list
 class IsFixedVectorOfLengthPred<list<int> allowedLengths> :
-  And<[IsFixedVectorTypePred,
+  And<[IsFixedVectorOfAnyRankTypePred,
        Or<!foreach(allowedlength, allowedLengths,
                    CPred<[{::llvm::cast<::mlir::VectorType>($_self).getNumElements()
                            == }]
@@ -604,16 +604,16 @@ class ScalableVectorOfLength<list<int> allowedLengths> : Type<
 // list
 class VectorOfLengthAndType<list<int> allowedLengths,
                             list<Type> allowedTypes> : AllOfType<
-  [VectorOf<allowedTypes>, VectorOfLength<allowedLengths>],
-  VectorOf<allowedTypes>.summary # VectorOfLength<allowedLengths>.summary,
+  [VectorOfNonZeroRankOf<allowedTypes>, VectorOfLength<allowedLengths>],
+   VectorOfNonZeroRankOf<allowedTypes>.summary # VectorOfLength<allowedLengths>.summary,
   "::mlir::VectorType">;
 
 // Any fixed-length vector where the number of elements is from the given
 // `allowedLengths` list and the type is from the given `allowedTypes` list
 class FixedVectorOfLengthAndType<list<int> allowedLengths,
                                  list<Type> allowedTypes> : AllOfType<
-  [FixedVectorOf<allowedTypes>, FixedVectorOfLength<allowedLengths>],
-  FixedVectorOf<allowedTypes>.summary #
+  [FixedVectorOfAnyRank<allowedTypes>, FixedVectorOfLength<allowedLengths>],
+  FixedVectorOfAnyRank<allowedTypes>.summary #
   FixedVectorOfLength<allowedLengths>.summary,
   "::mlir::VectorType">;
 
@@ -621,8 +621,8 @@ class FixedVectorOfLengthAndType<list<int> allowedLengths,
 // `allowedLengths` list and the type is from the given `allowedTypes` list
 class ScalableVectorOfLengthAndType<list<int> allowedLengths,
                                     list<Type> allowedTypes> : AllOfType<
-  [ScalableVectorOf<allowedTypes>, ScalableVectorOfLength<allowedLengths>],
-  ScalableVectorOf<allowedTypes>.summary #
+  [ScalableVectorOfAnyRank<allowedTypes>, ScalableVectorOfLength<allowedLengths>],
+  ScalableVectorOfAnyRank<allowedTypes>.summary #
   ScalableVectorOfLength<allowedLengths>.summary,
   "::mlir::VectorType">;
 
@@ -632,10 +632,10 @@ class ScalableVectorOfLengthAndType<list<int> allowedLengths,
 class ScalableVectorOfRankAndLengthAndType<list<int> allowedRanks,
                                            list<int> allowedLengths,
                                            list<Type> allowedTypes> : AllOfType<
-  [ScalableVectorOfRank<allowedRanks>, ScalableVectorOf<allowedTypes>,
+  [ScalableVectorOfRank<allowedRanks>, ScalableVectorOfAnyRank<allowedTypes>,
    ScalableVectorOfLength<allowedLengths>],
   ScalableVectorOfRank<allowedRanks>.summary #
-  ScalableVectorOf<allowedTypes>.summary #
+  ScalableVectorOfAnyRank<allowedTypes>.summary #
   ScalableVectorOfLength<allowedLengths>.summary,
   "::mlir::VectorType">;
 
@@ -657,13 +657,14 @@ class VectorWithTrailingDimScalableOfSizeAndType<list<int> allowedTrailingSizes,
    ShapedTypeWithNthDimOfSize<-1, allowedTrailingSizes>.summary,
   "::mlir::VectorType">;
 
-def AnyVector : VectorOf<[AnyType]>;
-// Temporary vector type clone that allows gradual transition to 0-D vectors.
+// Unlike the following definitions, this one excludes 0-D vectors
+def AnyVectorOfNonZeroRank : VectorOfNonZeroRankOf<[AnyType]>;
+
 def AnyVectorOfAnyRank : VectorOfAnyRankOf<[AnyType]>;
 
-def AnyFixedVector : FixedVectorOf<[AnyType]>;
+def AnyFixedVectorOfAnyRank : FixedVectorOfAnyRank<[AnyType]>;
 
-def AnyScalableVector : ScalableVectorOf<[AnyType]>;
+def AnyScalableVectorOfAnyRank : ScalableVectorOfAnyRank<[AnyType]>;
 
 // Shaped types.
 
diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td
index 6752113cab8d41..239d5292180269 100644
--- a/mlir/test/lib/Dialect/Test/TestOps.td
+++ b/mlir/test/lib/Dialect/Test/TestOps.td
@@ -2781,7 +2781,7 @@ def TestGraphLoopOp : TEST_Op<"graph_loop",
 //===----------------------------------------------------------------------===//
 // Test InferIntRangeInterface
 //===----------------------------------------------------------------------===//
-def InferIntRangeType : AnyTypeOf<[AnyInteger, Index, VectorOf<[AnyInteger, Index]>]>;
+def InferIntRangeType : AnyTypeOf<[AnyInteger, Index, VectorOfNonZeroRankOf<[AnyInteger, Index]>]>;
 
 def TestWithBoundsOp : TEST_Op<"with_bounds",
                           [DeclareOpInterfaceMethods<InferIntRangeInterface, ["inferResultRanges"]>,

From 7577284c4f3cb81a8ac648683bd3af292827391f Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Tue, 26 Nov 2024 07:34:04 -0800
Subject: [PATCH 23/41] [OpenACC][NFC] Update varlist-ast test to check
 serialization

I noticed while working on another test that I never used the PCH
trickery to get this to validate that serialization/deserialization
works correctly.  It DOES, but we weren't testing it with this test like
the others.
---
 .../test/SemaOpenACC/compute-construct-varlist-ast.cpp | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
index e057678d924957..1bfd4e8af64818 100644
--- a/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
+++ b/clang/test/SemaOpenACC/compute-construct-varlist-ast.cpp
@@ -1,5 +1,12 @@
 // RUN: %clang_cc1 %s -fopenacc -Wno-openacc-deprecated-clause-alias -ast-dump | FileCheck %s
 
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -Wno-openacc-deprecated-clause-alias -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -Wno-openacc-deprecated-clause-alias -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
 int Global;
 short GlobalArray[5];
 
@@ -435,7 +442,7 @@ void TemplUses(T t, U u, T*PointerParam) {
   // CHECK-NEXT: TemplateArgument type 'int'
   // CHECK-NEXT: BuiltinType{{.*}} 'int'
   // CHECK-NEXT: TemplateArgument type 'int[1]'
-  // CHECK-NEXT: ConstantArrayType{{.*}} 'int[1]' 1
+  // CHECK-NEXT: ConstantArrayType{{.*}} 'int[1]'{{.*}} 1
   // CHECK-NEXT: BuiltinType{{.*}} 'int'
   // CHECK-NEXT: ParmVarDecl{{.*}} used t 'int'
   // CHECK-NEXT: ParmVarDecl{{.*}} used u 'int *'
@@ -979,3 +986,4 @@ void Inst() {
   STempl<int> stempl;
   stempl.bar<int>();
 }
+#endif

From 80df56e03b0455382cec51557bfc9f099d5c0a6f Mon Sep 17 00:00:00 2001
From: Mark Goncharov <110403898+mga-sc@users.noreply.github.com>
Date: Tue, 26 Nov 2024 18:39:45 +0300
Subject: [PATCH 24/41] Reapply "[RISCV] Implement tail call optimization in
 machine outliner" (#117700)

This MR fixes failed test `CodeGen/RISCV/compress-opt-select.ll`.

It was failed due to previously merged commit `[TTI][RISCV]
Unconditionally break critical edges to sink ADDI (PR #108889)`.

So, regenerated `compress-opt-select` test.
---
 .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h |   6 +
 .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp |   6 +-
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp      | 143 +++++++++++++----
 .../test/CodeGen/RISCV/compress-opt-select.ll | 144 ++++++------------
 .../CodeGen/RISCV/machine-outliner-call.ll    |  70 +++++++++
 .../CodeGen/RISCV/machine-outliner-cfi.mir    |  22 +--
 .../machine-outliner-leaf-descendants.ll      |  13 +-
 .../RISCV/machine-outliner-patchable.ll       |  24 ++-
 .../RISCV/machine-outliner-position.mir       |  21 ++-
 .../test/CodeGen/RISCV/machineoutliner-x5.mir |  54 +++++++
 llvm/test/CodeGen/RISCV/machineoutliner.mir   |  18 ++-
 11 files changed, 346 insertions(+), 175 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/machine-outliner-call.ll
 create mode 100644 llvm/test/CodeGen/RISCV/machineoutliner-x5.mir

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
index 19103e219cb800..ca2f868cd4e764 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h
@@ -208,6 +208,12 @@ static inline unsigned getVLOpNum(const MCInstrDesc &Desc) {
   return Desc.getNumOperands() - Offset;
 }
 
+static inline unsigned getTailExpandUseRegNo(const FeatureBitset &FeatureBits) {
+  // For Zicfilp, PseudoTAIL should be expanded to a software guarded branch.
+  // It means to use t2(x7) as rs1 of JALR to expand PseudoTAIL.
+  return FeatureBits[RISCV::FeatureStdExtZicfilp] ? RISCV::X7 : RISCV::X6;
+}
+
 static inline unsigned getSEWOpNum(const MCInstrDesc &Desc) {
   const uint64_t TSFlags = Desc.TSFlags;
   assert(hasSEWOp(TSFlags));
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 6bb49e2bb85fe1..a28bf1186589d9 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -124,11 +124,7 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI,
   MCRegister Ra;
   if (MI.getOpcode() == RISCV::PseudoTAIL) {
     Func = MI.getOperand(0);
-    Ra = RISCV::X6;
-    // For Zicfilp, PseudoTAIL should be expanded to a software guarded branch.
-    // It means to use t2(x7) as rs1 of JALR to expand PseudoTAIL.
-    if (STI.hasFeature(RISCV::FeatureStdExtZicfilp))
-      Ra = RISCV::X7;
+    Ra = RISCVII::getTailExpandUseRegNo(STI.getFeatureBits());
   } else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
     Func = MI.getOperand(1);
     Ra = MI.getOperand(0).getReg();
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 933e776da47404..47273d6bc06d65 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -11,6 +11,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "RISCVInstrInfo.h"
+#include "MCTargetDesc/RISCVBaseInfo.h"
 #include "MCTargetDesc/RISCVMatInt.h"
 #include "RISCV.h"
 #include "RISCVMachineFunctionInfo.h"
@@ -2927,6 +2928,7 @@ bool RISCVInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
 
 // Enum values indicating how an outlined call should be constructed.
 enum MachineOutlinerConstructionID {
+  MachineOutlinerTailCall,
   MachineOutlinerDefault
 };
 
@@ -2935,46 +2937,118 @@ bool RISCVInstrInfo::shouldOutlineFromFunctionByDefault(
   return MF.getFunction().hasMinSize();
 }
 
+static bool isCandidatePatchable(const MachineBasicBlock &MBB) {
+  const MachineFunction *MF = MBB.getParent();
+  const Function &F = MF->getFunction();
+  return F.getFnAttribute("fentry-call").getValueAsBool() ||
+         F.hasFnAttribute("patchable-function-entry");
+}
+
+static bool isMIReadsReg(const MachineInstr &MI, const TargetRegisterInfo *TRI,
+                         unsigned RegNo) {
+  return MI.readsRegister(RegNo, TRI) ||
+         MI.getDesc().hasImplicitUseOfPhysReg(RegNo);
+}
+
+static bool isMIModifiesReg(const MachineInstr &MI,
+                            const TargetRegisterInfo *TRI, unsigned RegNo) {
+  return MI.modifiesRegister(RegNo, TRI) ||
+         MI.getDesc().hasImplicitDefOfPhysReg(RegNo);
+}
+
+static bool cannotInsertTailCall(const MachineBasicBlock &MBB) {
+  if (!MBB.back().isReturn())
+    return true;
+  if (isCandidatePatchable(MBB))
+    return true;
+
+  // If the candidate reads the pre-set register
+  // that can be used for expanding PseudoTAIL instruction,
+  // then we cannot insert tail call.
+  const TargetSubtargetInfo &STI = MBB.getParent()->getSubtarget();
+  unsigned TailExpandUseRegNo =
+      RISCVII::getTailExpandUseRegNo(STI.getFeatureBits());
+  for (const MachineInstr &MI : MBB) {
+    if (isMIReadsReg(MI, STI.getRegisterInfo(), TailExpandUseRegNo))
+      return true;
+    if (isMIModifiesReg(MI, STI.getRegisterInfo(), TailExpandUseRegNo))
+      break;
+  }
+  return false;
+}
+
+static std::optional<MachineOutlinerConstructionID>
+analyzeCandidate(outliner::Candidate &C) {
+  // If last instruction is return then we can rely on
+  // the verification already performed in the getOutliningTypeImpl.
+  if (C.back().isReturn()) {
+    assert(!cannotInsertTailCall(*C.getMBB()) &&
+           "The candidate who uses return instruction must be outlined "
+           "using tail call");
+    return MachineOutlinerTailCall;
+  }
+
+  auto CandidateUsesX5 = [](outliner::Candidate &C) {
+    const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
+    if (std::any_of(C.begin(), C.end(), [TRI](const MachineInstr &MI) {
+          return isMIModifiesReg(MI, TRI, RISCV::X5);
+        }))
+      return true;
+    return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
+  };
+
+  if (!CandidateUsesX5(C))
+    return MachineOutlinerDefault;
+
+  return std::nullopt;
+}
+
 std::optional<std::unique_ptr<outliner::OutlinedFunction>>
 RISCVInstrInfo::getOutliningCandidateInfo(
     const MachineModuleInfo &MMI,
     std::vector<outliner::Candidate> &RepeatedSequenceLocs,
     unsigned MinRepeats) const {
 
-  // First we need to filter out candidates where the X5 register (IE t0) can't
-  // be used to setup the function call.
-  auto CannotInsertCall = [](outliner::Candidate &C) {
-    const TargetRegisterInfo *TRI = C.getMF()->getSubtarget().getRegisterInfo();
-    return !C.isAvailableAcrossAndOutOfSeq(RISCV::X5, *TRI);
-  };
-
-  llvm::erase_if(RepeatedSequenceLocs, CannotInsertCall);
+  // Each RepeatedSequenceLoc is identical.
+  outliner::Candidate &Candidate = RepeatedSequenceLocs[0];
+  auto CandidateInfo = analyzeCandidate(Candidate);
+  if (!CandidateInfo)
+    RepeatedSequenceLocs.clear();
 
   // If the sequence doesn't have enough candidates left, then we're done.
   if (RepeatedSequenceLocs.size() < MinRepeats)
     return std::nullopt;
 
-  unsigned SequenceSize = 0;
-
-  for (auto &MI : RepeatedSequenceLocs[0])
-    SequenceSize += getInstSizeInBytes(MI);
+  unsigned InstrSizeCExt =
+      Candidate.getMF()->getSubtarget<RISCVSubtarget>().hasStdExtCOrZca() ? 2
+                                                                          : 4;
+  unsigned CallOverhead = 0, FrameOverhead = 0;
+
+  MachineOutlinerConstructionID MOCI = CandidateInfo.value();
+  switch (MOCI) {
+  case MachineOutlinerDefault:
+    // call t0, function = 8 bytes.
+    CallOverhead = 8;
+    // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
+    FrameOverhead = InstrSizeCExt;
+    break;
+  case MachineOutlinerTailCall:
+    // tail call = auipc + jalr in the worst case without linker relaxation.
+    CallOverhead = 4 + InstrSizeCExt;
+    // Using tail call we move ret instruction from caller to callee.
+    FrameOverhead = 0;
+    break;
+  }
 
-  // call t0, function = 8 bytes.
-  unsigned CallOverhead = 8;
   for (auto &C : RepeatedSequenceLocs)
-    C.setCallInfo(MachineOutlinerDefault, CallOverhead);
+    C.setCallInfo(MOCI, CallOverhead);
 
-  // jr t0 = 4 bytes, 2 bytes if compressed instructions are enabled.
-  unsigned FrameOverhead = 4;
-  if (RepeatedSequenceLocs[0]
-          .getMF()
-          ->getSubtarget<RISCVSubtarget>()
-          .hasStdExtCOrZca())
-    FrameOverhead = 2;
+  unsigned SequenceSize = 0;
+  for (auto &MI : Candidate)
+    SequenceSize += getInstSizeInBytes(MI);
 
   return std::make_unique<outliner::OutlinedFunction>(
-      RepeatedSequenceLocs, SequenceSize, FrameOverhead,
-      MachineOutlinerDefault);
+      RepeatedSequenceLocs, SequenceSize, FrameOverhead, MOCI);
 }
 
 outliner::InstrType
@@ -2995,15 +3069,8 @@ RISCVInstrInfo::getOutliningTypeImpl(const MachineModuleInfo &MMI,
     return F.needsUnwindTableEntry() ? outliner::InstrType::Illegal
                                      : outliner::InstrType::Invisible;
 
-  // We need support for tail calls to outlined functions before return
-  // statements can be allowed.
-  if (MI.isReturn())
-    return outliner::InstrType::Illegal;
-
-  // Don't allow modifying the X5 register which we use for return addresses for
-  // these outlined functions.
-  if (MI.modifiesRegister(RISCV::X5, TRI) ||
-      MI.getDesc().hasImplicitDefOfPhysReg(RISCV::X5))
+  if (cannotInsertTailCall(*MBB) &&
+      (MI.isReturn() || isMIModifiesReg(MI, TRI, RISCV::X5)))
     return outliner::InstrType::Illegal;
 
   // Make sure the operands don't reference something unsafe.
@@ -3039,6 +3106,9 @@ void RISCVInstrInfo::buildOutlinedFrame(
     }
   }
 
+  if (OF.FrameConstructionID == MachineOutlinerTailCall)
+    return;
+
   MBB.addLiveIn(RISCV::X5);
 
   // Add in a return instruction to the end of the outlined frame.
@@ -3052,6 +3122,13 @@ MachineBasicBlock::iterator RISCVInstrInfo::insertOutlinedCall(
     Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
     MachineFunction &MF, outliner::Candidate &C) const {
 
+  if (C.CallConstructionID == MachineOutlinerTailCall) {
+    It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(RISCV::PseudoTAIL))
+                            .addGlobalAddress(M.getNamedValue(MF.getName()),
+                                              /*Offset=*/0, RISCVII::MO_CALL));
+    return It;
+  }
+
   // Add in a call instruction to the outlined function at the given location.
   It = MBB.insert(It,
                   BuildMI(MF, DebugLoc(), get(RISCV::PseudoCALLReg), RISCV::X5)
diff --git a/llvm/test/CodeGen/RISCV/compress-opt-select.ll b/llvm/test/CodeGen/RISCV/compress-opt-select.ll
index f9333a45016a06..733c84ac236133 100644
--- a/llvm/test/CodeGen/RISCV/compress-opt-select.ll
+++ b/llvm/test/CodeGen/RISCV/compress-opt-select.ll
@@ -13,11 +13,9 @@ define i32 @ne_small_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, 20
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB0_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB0_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_small_pos:
 ; RV32IFD:       # %bb.0:
@@ -41,11 +39,9 @@ define i32 @ne_small_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, -20
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB1_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB1_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_small_neg:
 ; RV32IFD:       # %bb.0:
@@ -69,11 +65,9 @@ define i32 @ne_small_edge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, 31
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB2_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB2_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_small_edge_pos:
 ; RV32IFD:       # %bb.0:
@@ -97,11 +91,9 @@ define i32 @ne_small_edge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, -32
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB3_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB3_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_small_edge_neg:
 ; RV32IFD:       # %bb.0:
@@ -126,11 +118,9 @@ define i32 @ne_medium_ledge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -33
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB4_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB4_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_ledge_pos:
 ; RV32IFD:       # %bb.0:
@@ -155,11 +145,9 @@ define i32 @ne_medium_ledge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 33
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB5_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB5_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_ledge_neg:
 ; RV32IFD:       # %bb.0:
@@ -184,11 +172,9 @@ define i32 @ne_medium_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -63
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB6_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB6_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_pos:
 ; RV32IFD:       # %bb.0:
@@ -213,11 +199,9 @@ define i32 @ne_medium_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 63
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB7_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB7_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_neg:
 ; RV32IFD:       # %bb.0:
@@ -242,11 +226,9 @@ define i32 @ne_medium_bedge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -2047
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB8_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB8_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_bedge_pos:
 ; RV32IFD:       # %bb.0:
@@ -271,11 +253,9 @@ define i32 @ne_medium_bedge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 2047
 ; RV32IFDC-NEXT:    c.bnez a0, .LBB9_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB9_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_medium_bedge_neg:
 ; RV32IFD:       # %bb.0:
@@ -300,11 +280,9 @@ define i32 @ne_big_ledge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.slli a1, 11
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB10_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB10_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_big_ledge_pos:
 ; RV32IFD:       # %bb.0:
@@ -329,11 +307,9 @@ define i32 @ne_big_ledge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a1, zero, -2048
 ; RV32IFDC-NEXT:    bne a0, a1, .LBB11_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB11_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: ne_big_ledge_neg:
 ; RV32IFD:       # %bb.0:
@@ -360,11 +336,9 @@ define i32 @eq_small_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, 20
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB12_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB12_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_small_pos:
 ; RV32IFD:       # %bb.0:
@@ -388,11 +362,9 @@ define i32 @eq_small_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, -20
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB13_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB13_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_small_neg:
 ; RV32IFD:       # %bb.0:
@@ -416,11 +388,9 @@ define i32 @eq_small_edge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, 31
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB14_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB14_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_small_edge_pos:
 ; RV32IFD:       # %bb.0:
@@ -444,11 +414,9 @@ define i32 @eq_small_edge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.li a1, -32
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB15_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB15_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_small_edge_neg:
 ; RV32IFD:       # %bb.0:
@@ -473,11 +441,9 @@ define i32 @eq_medium_ledge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -33
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB16_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB16_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_ledge_pos:
 ; RV32IFD:       # %bb.0:
@@ -502,11 +468,9 @@ define i32 @eq_medium_ledge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 33
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB17_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB17_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_ledge_neg:
 ; RV32IFD:       # %bb.0:
@@ -531,11 +495,9 @@ define i32 @eq_medium_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -63
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB18_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB18_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_pos:
 ; RV32IFD:       # %bb.0:
@@ -560,11 +522,9 @@ define i32 @eq_medium_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 63
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB19_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB19_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_neg:
 ; RV32IFD:       # %bb.0:
@@ -589,11 +549,9 @@ define i32 @eq_medium_bedge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, -2047
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB20_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB20_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_bedge_pos:
 ; RV32IFD:       # %bb.0:
@@ -618,11 +576,9 @@ define i32 @eq_medium_bedge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a0, a0, 2047
 ; RV32IFDC-NEXT:    c.beqz a0, .LBB21_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB21_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_medium_bedge_neg:
 ; RV32IFD:       # %bb.0:
@@ -647,11 +603,9 @@ define i32 @eq_big_ledge_pos(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    c.slli a1, 11
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB22_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB22_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_big_ledge_pos:
 ; RV32IFD:       # %bb.0:
@@ -676,11 +630,9 @@ define i32 @eq_big_ledge_neg(i32 %in0) minsize {
 ; RV32IFDC-NEXT:    addi a1, zero, -2048
 ; RV32IFDC-NEXT:    beq a0, a1, .LBB23_2
 ; RV32IFDC-NEXT:  # %bb.1:
-; RV32IFDC-NEXT:    addi a0, zero, 42
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_1
 ; RV32IFDC-NEXT:  .LBB23_2:
-; RV32IFDC-NEXT:    addi a0, zero, -99
-; RV32IFDC-NEXT:    c.jr ra
+; RV32IFDC-NEXT:    tail OUTLINED_FUNCTION_0
 ;
 ; RV32IFD-LABEL: eq_big_ledge_neg:
 ; RV32IFD:       # %bb.0:
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-call.ll b/llvm/test/CodeGen/RISCV/machine-outliner-call.ll
new file mode 100644
index 00000000000000..b019cfe74864b0
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-call.ll
@@ -0,0 +1,70 @@
+; RUN: llc < %s -verify-machineinstrs -enable-machine-outliner | FileCheck %s
+
+target triple = "riscv64-unknown-linux-gnu"
+
+declare void @foo(i32, i32, i32, i32) minsize
+
+define void @fentry0(i1 %a) nounwind {
+; CHECK-LABEL: fentry0:
+; CHECK:       # %bb.1:
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB0_2:
+; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+entry:
+  br i1 %a, label %if.then, label %if.end
+if.then:
+  call void @foo(i32 1, i32 2, i32 3, i32 4)
+  br label %if.end
+if.end:
+  call void @foo(i32 5, i32 6, i32 7, i32 8)
+  ret void
+}
+
+define void @fentry1(i1 %a) nounwind {
+; CHECK-LABEL: fentry1:
+; CHECK:       # %bb.1:
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB1_2:
+; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+entry:
+  br i1 %a, label %if.then, label %if.end
+if.then:
+  call void @foo(i32 1, i32 2, i32 3, i32 4)
+  br label %if.end
+if.end:
+  call void @foo(i32 5, i32 6, i32 7, i32 8)
+  ret void
+}
+
+define void @fentry2(i1 %a) nounwind {
+; CHECK-LABEL: fentry2:
+; CHECK:       # %bb.1:
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB2_2:
+; CHECK-NEXT:    tail OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+entry:
+  br i1 %a, label %if.then, label %if.end
+if.then:
+  call void @foo(i32 1, i32 2, i32 3, i32 4)
+  br label %if.end
+if.end:
+  call void @foo(i32 5, i32 6, i32 7, i32 8)
+  ret void
+}
+
+; CHECK:       OUTLINED_FUNCTION_[[BB2]]:
+; CHECK:       li      a0, 5
+; CHECK-NEXT:  li      a1, 6
+; CHECK-NEXT:  li      a2, 7
+; CHECK-NEXT:  li      a3, 8
+; CHECK-NEXT:  call foo
+
+; CHECK:       OUTLINED_FUNCTION_[[BB1]]:
+; CHECK:       li      a0, 1
+; CHECK-NEXT:  li      a1, 2
+; CHECK-NEXT:  li      a2, 3
+; CHECK-NEXT:  li      a3, 4
+; CHECK-NEXT:  jr      t0
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
index 6ecca6a1b18ef8..2acb1d43e01eaf 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-cfi.mir
@@ -22,13 +22,11 @@ body:             |
     ; RV32I-MO-LABEL: name: func1
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ; RV64I-MO-LABEL: name: func1
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
     $x11 = ORI $x11, 1023
@@ -49,13 +47,11 @@ body:             |
     ; RV32I-MO-LABEL: name: func2
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ; RV64I-MO-LABEL: name: func2
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, 0
     $x11 = ORI $x11, 1023
@@ -76,13 +72,11 @@ body:             |
     ; RV32I-MO-LABEL: name: func3
     ; RV32I-MO: liveins: $x10, $x11
     ; RV32I-MO-NEXT: {{  $}}
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ; RV64I-MO-LABEL: name: func3
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     CFI_INSTRUCTION offset $x1, -12
     $x11 = ORI $x11, 1023
@@ -96,11 +90,11 @@ body:             |
 
 
 # OUTLINED-LABEL: name: OUTLINED_FUNCTION_0
-# OUTLINED: liveins: $x11, $x10, $x5
+# OUTLINED: liveins: $x11, $x10
 # OUTLINED-NEXT: {{  $}}
 # OUTLINED-NEXT: $x10 = ORI $x10, 1023
 # OUTLINED-NEXT: $x11 = ORI $x11, 1023
 # OUTLINED-NEXT: $x12 = ADDI $x10, 17
 # OUTLINED-NEXT: $x11 = AND $x12, $x11
 # OUTLINED-NEXT: $x10 = SUB $x10, $x11
-# OUTLINED-NEXT: $x0 = JALR $x5, 0
+# OUTLINED-NEXT: PseudoRET
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll b/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
index 8fab0aa9b6a76c..0441361b117989 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-leaf-descendants.ll
@@ -94,7 +94,8 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
 ; CHECK-BASELINE-NEXT:	li	a5, 0x6
-; CHECK-BASELINE-NEXT:	jr	t0
+; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
+; CHECK-BASELINE-NEXT:	jr	t1
 
 ; CHECK-BASELINE: <OUTLINED_FUNCTION_1>:
 ; CHECK-BASELINE-NEXT:	li	a0, 0x1
@@ -102,8 +103,9 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a2, 0x3
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
-; CHECK-BASELINE-NEXT:	li	a5, 0x7
-; CHECK-BASELINE-NEXT:	jr	t0
+; CHECK-BASELINE-NEXT:	li	a5, 0x8
+; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
+; CHECK-BASELINE-NEXT:	jr	t1
 
 ; CHECK-BASELINE: <OUTLINED_FUNCTION_2>:
 ; CHECK-BASELINE-NEXT:	li	a0, 0x1
@@ -111,8 +113,9 @@ define i32 @_Z2f6v() minsize {
 ; CHECK-BASELINE-NEXT:	li	a2, 0x3
 ; CHECK-BASELINE-NEXT:	li	a3, 0x4
 ; CHECK-BASELINE-NEXT:	li	a4, 0x5
-; CHECK-BASELINE-NEXT:	li	a5, 0x8
-; CHECK-BASELINE-NEXT:	jr	t0
+; CHECK-BASELINE-NEXT:	li	a5, 0x7
+; CHECK-BASELINE-NEXT:	auipc	t1, 0x0
+; CHECK-BASELINE-NEXT:	jr	t1
 
 ; CHECK-LEAF-DESCENDANTS: <OUTLINED_FUNCTION_0>:
 ; CHECK-LEAF-DESCENDANTS-NEXT:	li	a0, 0x1
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll b/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
index 4ef3abd241577f..4a54a7289ddf27 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-patchable.ll
@@ -11,7 +11,11 @@ define void @fentry0(i1 %a) nounwind "fentry-call"="true" {
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    # FEntry call
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB0_2:
+; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+; CHECK-NEXT:    call	foo
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -27,7 +31,11 @@ define void @fentry1(i1 %a) nounwind "fentry-call"="true" {
 ; CHECK-NEXT:  # %bb.0:
 ; CHECK-NEXT:    # FEntry call
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB1_2:
+; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+; CHECK-NEXT:    call	foo
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -47,7 +55,11 @@ define void @patchable0(i1 %a) nounwind "patchable-function-entry"="2" {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB2_2:
+; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+; CHECK-NEXT:    call	foo
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
@@ -65,7 +77,11 @@ define void @patchable1(i1 %a) nounwind "patchable-function-entry"="2" {
 ; CHECK-NEXT:    nop
 ; CHECK-NEXT:    nop
 ; CHECK:       # %bb.1:
-; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_1
+; CHECK-NEXT:    call t0, OUTLINED_FUNCTION_[[BB1:[0-9]+]]
+; CHECK-NEXT:    call foo
+; CHECK-LABEL: .LBB3_2:
+; CHECK-NEXT:    call	t0, OUTLINED_FUNCTION_[[BB2:[0-9]+]]
+; CHECK-NEXT:    call	foo
 entry:
   br i1 %a, label %if.then, label %if.end
 if.then:
diff --git a/llvm/test/CodeGen/RISCV/machine-outliner-position.mir b/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
index 715e212eecabb3..47ec447f61d09c 100644
--- a/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
+++ b/llvm/test/CodeGen/RISCV/machine-outliner-position.mir
@@ -25,15 +25,14 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: EH_LABEL <mcsymbol .Ltmp0>
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ;
     ; RV64I-MO-LABEL: name: func1
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: EH_LABEL <mcsymbol .Ltmp0>
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     EH_LABEL <mcsymbol .Ltmp0>
     $x11 = ORI $x11, 1023
@@ -53,15 +52,14 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: GC_LABEL <mcsymbol .Ltmp1>
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ;
     ; RV64I-MO-LABEL: name: func2
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: GC_LABEL <mcsymbol .Ltmp1>
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     GC_LABEL <mcsymbol .Ltmp1>
     $x11 = ORI $x11, 1023
@@ -81,15 +79,14 @@ body:             |
     ; RV32I-MO-NEXT: {{  $}}
     ; RV32I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV32I-MO-NEXT: ANNOTATION_LABEL <mcsymbol .Ltmp2>
-    ; RV32I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV32I-MO-NEXT: PseudoRET
+    ; RV32I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
+    ;
     ; RV64I-MO-LABEL: name: func3
     ; RV64I-MO: liveins: $x10, $x11
     ; RV64I-MO-NEXT: {{  $}}
     ; RV64I-MO-NEXT: $x10 = ORI $x10, 1023
     ; RV64I-MO-NEXT: ANNOTATION_LABEL <mcsymbol .Ltmp2>
-    ; RV64I-MO-NEXT: $x5 = PseudoCALLReg target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit-def $x5, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x10, implicit $x11
-    ; RV64I-MO-NEXT: PseudoRET
+    ; RV64I-MO-NEXT: PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x10 = ORI $x10, 1023
     ANNOTATION_LABEL <mcsymbol .Ltmp2>
     $x11 = ORI $x11, 1023
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir b/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir
new file mode 100644
index 00000000000000..14a9cd7f0dcffa
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/machineoutliner-x5.mir
@@ -0,0 +1,54 @@
+# Check that modifying X5 register is not a problem for machine outliner
+
+# RUN: llc -mtriple=riscv32 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
+# RUN: | FileCheck -check-prefixes=CHECK,RV32I-MO %s
+# RUN: llc -mtriple=riscv64 -x mir -run-pass=machine-outliner -simplify-mir -verify-machineinstrs < %s \
+# RUN: | FileCheck -check-prefixes=CHECK,RV64I-MO %s
+
+--- |
+  define i32 @outline_tail_1(i32 %a, i32 %b) { ret i32 0 }
+
+  define i32 @outline_tail_2(i32 %a, i32 %b) { ret i32 0 }
+...
+---
+name:            outline_tail_1
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  bb.0:
+    liveins: $x10, $x11, $x5
+    ; RV32I-MO-LABEL: name: outline_tail_1
+    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
+    ;
+    ; RV64I-MO-LABEL: name: outline_tail_1
+    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x10 = ADD $x10, $x5
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    PseudoRET implicit $x10
+...
+---
+name:            outline_tail_2
+tracksRegLiveness: true
+isOutlined: false
+body:             |
+  bb.0:
+    liveins: $x10, $x11, $x5
+    ; RV32I-MO-LABEL: name: outline_tail_2
+    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
+    ;
+    ; RV64I-MO-LABEL: name: outline_tail_2
+    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x5, implicit $x10, implicit $x11
+    $x11 = ORI $x11, 1023
+    $x12 = ADDI $x10, 17
+    $x10 = ADD $x10, $x5
+    $x11 = AND $x12, $x11
+    $x10 = SUB $x10, $x11
+    PseudoRET implicit $x10
+...
+
+
+# CHECK-LABEL: name: OUTLINED_FUNCTION_0
+# CHECK: isOutlined: true
diff --git a/llvm/test/CodeGen/RISCV/machineoutliner.mir b/llvm/test/CodeGen/RISCV/machineoutliner.mir
index 0221257354fcfa..ab12bfbe1fafc4 100644
--- a/llvm/test/CodeGen/RISCV/machineoutliner.mir
+++ b/llvm/test/CodeGen/RISCV/machineoutliner.mir
@@ -29,10 +29,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_0
-    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: outline_0
-    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -48,10 +48,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_1
-    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: outline_1
-    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -67,10 +67,10 @@ body:             |
   bb.0:
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: outline_2
-    ; RV32I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: outline_2
-    ; RV64I-MO: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO:         PseudoTAIL target-flags(riscv-call) @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -87,9 +87,11 @@ body:             |
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: dont_outline_0
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_0
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -106,9 +108,11 @@ body:             |
     liveins: $x10, $x11
     ; RV32I-MO-LABEL: name: dont_outline_1
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_1
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11
@@ -125,9 +129,11 @@ body:             |
     liveins: $x10, $x11, $x5
     ; RV32I-MO-LABEL: name: dont_outline_2
     ; RV32I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV32I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     ;
     ; RV64I-MO-LABEL: name: dont_outline_2
     ; RV64I-MO-NOT: $x5 = PseudoCALLReg {{.*}} @OUTLINED_FUNCTION_0
+    ; RV64I-MO-NOT: PseudoTAIL @OUTLINED_FUNCTION_0, implicit $x2, implicit-def $x10, implicit-def $x11, implicit-def $x12, implicit $x2, implicit $x10, implicit $x11
     $x11 = ORI $x11, 1023
     $x12 = ADDI $x10, 17
     $x11 = AND $x12, $x11

From f7dc1d0ac83b7c6b691167d8d02561ba0837b631 Mon Sep 17 00:00:00 2001
From: Anton Sidorenko <anton.sidorenko@syntacore.com>
Date: Tue, 26 Nov 2024 18:42:01 +0300
Subject: [PATCH 25/41] Revert "[Clang] Fix name lookup for dependent bases
 (#114978)" (#117727)

This reverts commit 486644723038555a224fd09d462bb5099e64809e as
requested by the commit author.

Buildbots fail:
* https://lab.llvm.org/buildbot/#/builders/164/builds/4945
* https://lab.llvm.org/buildbot/#/builders/52/builds/4021
---
 clang/docs/ReleaseNotes.rst      |  3 --
 clang/lib/AST/CXXInheritance.cpp | 18 ++++--------
 clang/test/CXX/drs/cwg5xx.cpp    | 48 ++------------------------------
 clang/www/cxx_dr_status.html     |  2 +-
 4 files changed, 9 insertions(+), 62 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index c4086a5bcbf368..6c40e48e2f49b3 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -279,9 +279,6 @@ Resolutions to C++ Defect Reports
   by default.
   (`CWG2521: User-defined literals and reserved identifiers <https://cplusplus.github.io/CWG/issues/2521.html>`_).
 
-- Fix name lookup for a dependent base class that is the current instantiation.  
-  (`CWG591: When a dependent base class is the current instantiation <https://cplusplus.github.io/CWG/issues/591.html>`_).
-
 C Language Changes
 ------------------
 
diff --git a/clang/lib/AST/CXXInheritance.cpp b/clang/lib/AST/CXXInheritance.cpp
index 10b8d524ff8978..aefc06e9197cfb 100644
--- a/clang/lib/AST/CXXInheritance.cpp
+++ b/clang/lib/AST/CXXInheritance.cpp
@@ -134,7 +134,7 @@ bool CXXRecordDecl::forallBases(ForallBasesCallback BaseMatches) const {
         return false;
 
       CXXRecordDecl *Base =
-          cast_if_present<CXXRecordDecl>(Ty->getDecl()->getDefinition());
+            cast_or_null<CXXRecordDecl>(Ty->getDecl()->getDefinition());
       if (!Base ||
           (Base->isDependentContext() &&
            !Base->isCurrentInstantiation(Record))) {
@@ -169,21 +169,13 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
     QualType BaseType =
         Context.getCanonicalType(BaseSpec.getType()).getUnqualifiedType();
 
-    bool isCurrentInstantiation = isa<InjectedClassNameType>(BaseType);
-    if (!isCurrentInstantiation) {
-      if (auto *BaseRecord = cast_if_present<CXXRecordDecl>(
-              BaseSpec.getType()->getAsRecordDecl()))
-        isCurrentInstantiation = BaseRecord->isDependentContext() &&
-                                 BaseRecord->isCurrentInstantiation(Record);
-    }
     // C++ [temp.dep]p3:
     //   In the definition of a class template or a member of a class template,
     //   if a base class of the class template depends on a template-parameter,
     //   the base class scope is not examined during unqualified name lookup
     //   either at the point of definition of the class template or member or
     //   during an instantiation of the class tem- plate or member.
-    if (!LookupInDependent &&
-        (BaseType->isDependentType() && !isCurrentInstantiation))
+    if (!LookupInDependent && BaseType->isDependentType())
       continue;
 
     // Determine whether we need to visit this base class at all,
@@ -251,8 +243,9 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
         return FoundPath;
       }
     } else if (VisitBase) {
-      CXXRecordDecl *BaseRecord = nullptr;
+      CXXRecordDecl *BaseRecord;
       if (LookupInDependent) {
+        BaseRecord = nullptr;
         const TemplateSpecializationType *TST =
             BaseSpec.getType()->getAs<TemplateSpecializationType>();
         if (!TST) {
@@ -271,7 +264,8 @@ bool CXXBasePaths::lookupInBases(ASTContext &Context,
             BaseRecord = nullptr;
         }
       } else {
-        BaseRecord = cast<CXXRecordDecl>(BaseSpec.getType()->getAsRecordDecl());
+        BaseRecord = cast<CXXRecordDecl>(
+            BaseSpec.getType()->castAs<RecordType>()->getDecl());
       }
       if (BaseRecord &&
           lookupInBases(Context, BaseRecord, BaseMatches, LookupInDependent)) {
diff --git a/clang/test/CXX/drs/cwg5xx.cpp b/clang/test/CXX/drs/cwg5xx.cpp
index 0d53a9d07d76de..ed0c7159dfc889 100644
--- a/clang/test/CXX/drs/cwg5xx.cpp
+++ b/clang/test/CXX/drs/cwg5xx.cpp
@@ -1178,61 +1178,17 @@ namespace cwg590 { // cwg590: yes
   template<typename T> typename A<T>::B::C A<T>::B::C::f(A<T>::B::C) {}
 }
 
-namespace cwg591 { // cwg591: yes
+namespace cwg591 { // cwg591: no
   template<typename T> struct A {
     typedef int M;
     struct B {
       typedef void M;
       struct C;
-      struct D;
-    };
-  };
-
-  template<typename T> struct G {
-    struct B {
-      typedef int M;
-      struct C {
-        typedef void M;
-        struct D;
-      };
-    };
-  };
-
-  template<typename T> struct H {
-    template<typename U> struct B {
-      typedef int M;
-      template<typename F> struct C {
-        typedef void M;
-        struct D;
-        struct P;
-      };
     };
   };
 
   template<typename T> struct A<T>::B::C : A<T> {
-    M m;
-  };
-
-  template<typename T> struct G<T>::B::C::D : B {
-    M m;
-  };
-
-  template<typename T>
-  template<typename U>
-  template<typename F>
-  struct H<T>::B<U>::C<F>::D : B<U> {
-    M m;
-  };
-
-  template<typename T> struct A<T>::B::D : A<T*> {
-    M m;
-    // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}}
-  };
-
-  template<typename T>
-  template<typename U>
-  template<typename F>
-  struct H<T>::B<U>::C<F>::P : B<F> {
+    // FIXME: Should find member of non-dependent base class A<T>.
     M m;
     // expected-error@-1 {{field has incomplete type 'M' (aka 'void'}}
   };
diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html
index c773c58fac4d0f..186f7cc0ace546 100755
--- a/clang/www/cxx_dr_status.html
+++ b/clang/www/cxx_dr_status.html
@@ -3599,7 +3599,7 @@ <h2 id="cxxdr">C++ defect report implementation status</h2>
     <td><a href="https://cplusplus.github.io/CWG/issues/591.html">591</a></td>
     <td>CD4</td>
     <td>When a dependent base class is the current instantiation</td>
-    <td class="none" align="center">Yes</td>
+    <td class="none" align="center">No</td>
   </tr>
   <tr id="592">
     <td><a href="https://cplusplus.github.io/CWG/issues/592.html">592</a></td>

From 86f7f089ee6bcf01bf082ca802220b1143a3ade9 Mon Sep 17 00:00:00 2001
From: Miro Bucko <mbucko@meta.com>
Date: Tue, 26 Nov 2024 11:29:24 -0500
Subject: [PATCH 26/41] Fix return value of 'PluginManager::RegisterPlugin()'.
 (#114120)

---
 lldb/source/Core/PluginManager.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Core/PluginManager.cpp b/lldb/source/Core/PluginManager.cpp
index a5219025495a91..80c9465f9af721 100644
--- a/lldb/source/Core/PluginManager.cpp
+++ b/lldb/source/Core/PluginManager.cpp
@@ -206,10 +206,9 @@ template <typename Instance> class PluginInstances {
     if (!callback)
       return false;
     assert(!name.empty());
-    Instance instance =
-        Instance(name, description, callback, std::forward<Args>(args)...);
-    m_instances.push_back(instance);
-    return false;
+    m_instances.emplace_back(name, description, callback,
+                             std::forward<Args>(args)...);
+    return true;
   }
 
   bool UnregisterPlugin(typename Instance::CallbackType callback) {

From 88cff867a58247d0c1da19e537eb8801a54ed38e Mon Sep 17 00:00:00 2001
From: Nikita Popov <npopov@redhat.com>
Date: Tue, 26 Nov 2024 17:43:56 +0100
Subject: [PATCH 27/41] [InstCombine] Add tests for #113301 (NFC)

---
 .../InstCombine/select-value-equivalence.ll   | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/llvm/test/Transforms/InstCombine/select-value-equivalence.ll b/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
index da2e59d760f96f..d55766ddf40405 100644
--- a/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
+++ b/llvm/test/Transforms/InstCombine/select-value-equivalence.ll
@@ -309,3 +309,28 @@ define <2 x float> @select_fcmp_fadd_une_zero_vec(<2 x float> %x, <2 x float> %y
   %retval = select <2 x i1> %fcmp, <2 x float> %x, <2 x float> %fadd
   ret <2 x float> %retval
 }
+
+define <2 x i8> @select_vec_op_const_no_undef(<2 x i8> %x) {
+; CHECK-LABEL: define <2 x i8> @select_vec_op_const_no_undef(
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq <2 x i8> [[X]], <i8 1, i8 2>
+; CHECK-NEXT:    [[XR:%.*]] = select <2 x i1> [[XZ]], <2 x i8> <i8 1, i8 2>, <2 x i8> <i8 4, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[XR]]
+;
+  %xz = icmp eq <2 x i8> %x, <i8 1, i8 2>
+  %xr = select <2 x i1> %xz, <2 x i8> %x, <2 x i8> <i8 4, i8 3>
+  ret <2 x i8> %xr
+}
+
+; FIXME: This is a miscompile.
+define <2 x i8> @select_vec_op_const_undef(<2 x i8> %x) {
+; CHECK-LABEL: define <2 x i8> @select_vec_op_const_undef(
+; CHECK-SAME: <2 x i8> [[X:%.*]]) {
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq <2 x i8> [[X]], <i8 1, i8 undef>
+; CHECK-NEXT:    [[XR:%.*]] = select <2 x i1> [[XZ]], <2 x i8> <i8 1, i8 undef>, <2 x i8> <i8 4, i8 3>
+; CHECK-NEXT:    ret <2 x i8> [[XR]]
+;
+  %xz = icmp eq <2 x i8> %x, <i8 1, i8 undef>
+  %xr = select <2 x i1> %xz, <2 x i8> %x, <2 x i8> <i8 4, i8 3>
+  ret <2 x i8> %xr
+}

From ced2fc7819d5ddea616ec330f18e08ff284c1868 Mon Sep 17 00:00:00 2001
From: Christopher Bate <cbate@nvidia.com>
Date: Tue, 26 Nov 2024 09:45:57 -0700
Subject: [PATCH 28/41] [mlir][bufferization] Fix OneShotBufferize when
 `defaultMemorySpaceFn` is used (#91524)

As described in issue llvm/llvm-project#91518, a previous PR
llvm/llvm-project#78484 introduced the `defaultMemorySpaceFn` into
bufferization options, allowing one to inform OneShotBufferize that it
should use a specified function to derive the memory space attribute
from the encoding attribute attached to tensor types.

However, introducing this feature exposed unhandled edge cases,
examples of which are introduced by this change in the new test under

`test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir`.

Fixing the inconsistencies introduced by `defaultMemorySpaceFn` is
pretty simple. This change:

- Updates the `bufferization.to_memref` and `bufferization.to_tensor`
  operations to explicitly include operand and destination types,
  whereas previously they relied on type inference to deduce the
  tensor types. Since the type inference cannot recover the correct
  tensor encoding/memory space, the operand and result types must be
  explicitly included. This is a small assembly format change, but it
  touches a large number of test files.

- Makes minor updates to other bufferization functions to handle the
  changes in building the above ops.

- Updates bufferization of `tensor.from_elements` to handle memory
  space.


Integration/upgrade guide:

In downstream projects, if you have tests or MLIR files that explicitly
use
`bufferization.to_tensor` or `bufferization.to_memref`, then update
them to the new assembly format as follows:

```
%1 = bufferization.to_memref %0 : memref<10xf32>
%2 = bufferization.to_tensor %1 : memref<10xf32>
```

becomes

```
%1 = bufferization.to_memref %0 : tensor<10xf32> to memref<10xf32>
%2 = bufferization.to_tensor %0 : memref<10xf32> to tensor<10xf32>
```
---
 mlir/docs/Bufferization.md                    |   4 +-
 .../Bufferization/IR/BufferizationOps.td      |  26 ++--
 .../Bufferization/Transforms/Passes.td        |   8 +-
 .../IR/BufferizableOpInterface.cpp            |   2 +-
 .../Bufferization/Transforms/Bufferize.cpp    |  21 +++-
 .../BufferizableOpInterfaceImpl.cpp           |  28 +++--
 .../BufferizableOpInterfaceImpl.cpp           |  12 +-
 mlir/test/Dialect/Affine/loop-fusion-4.mlir   |   4 +-
 mlir/test/Dialect/Arith/bufferize.mlir        |   6 +-
 .../dealloc-other.mlir                        |   2 +-
 .../one-shot-bufferize-analysis.mlir          |   4 +-
 .../one-shot-bufferize-encodings.mlir         | 111 +++++++++++++++++
 .../one-shot-bufferize-partial.mlir           |   6 +-
 .../Transforms/one-shot-bufferize.mlir        |   6 +-
 .../one-shot-module-bufferize-analysis.mlir   |   2 +-
 ...ule-bufferize-force-copy-before-write.mlir |   4 +-
 .../one-shot-module-bufferize-invalid.mlir    |   2 +-
 .../Transforms/one-shot-module-bufferize.mlir |   2 +-
 .../Dialect/Bufferization/canonicalize.mlir   |  32 ++---
 mlir/test/Dialect/Bufferization/ops.mlir      |   6 +-
 .../ControlFlow/one-shot-bufferize.mlir       |   4 +-
 mlir/test/Dialect/Linalg/bufferize.mlir       |  10 +-
 mlir/test/Dialect/Linalg/canonicalize.mlir    |   8 +-
 .../Dialect/MemRef/normalize-memrefs.mlir     |   4 +-
 mlir/test/Dialect/SCF/bufferize.mlir          |  12 +-
 .../SCF/one-shot-bufferize-encodings.mlir     |  73 +++++++++++
 mlir/test/Dialect/Shape/bufferize.mlir        |   2 +-
 .../SparseTensor/GPU/gpu_matmul24_lib.mlir    |   6 +-
 .../SparseTensor/GPU/gpu_matmul_lib.mlir      |   4 +-
 .../SparseTensor/GPU/gpu_matvec_lib.mlir      |   4 +-
 .../GPU/gpu_sampled_matmul_lib.mlir           |   4 +-
 .../SparseTensor/GPU/gpu_sddmm_lib.mlir       |   4 +-
 .../SparseTensor/constant_index_map.mlir      |   4 +-
 mlir/test/Dialect/SparseTensor/dense.mlir     |   6 +-
 .../fuse_sparse_pad_with_consumer.mlir        |   4 +-
 .../test/Dialect/SparseTensor/sorted_coo.mlir |   4 +-
 mlir/test/Dialect/SparseTensor/sparse_1d.mlir |  28 ++---
 mlir/test/Dialect/SparseTensor/sparse_2d.mlir |  82 ++++++-------
 mlir/test/Dialect/SparseTensor/sparse_3d.mlir | 114 +++++++++---------
 .../Dialect/SparseTensor/sparse_affine.mlir   |  16 +--
 .../Dialect/SparseTensor/sparse_batch.mlir    |   2 +-
 .../Dialect/SparseTensor/sparse_fp_ops.mlir   |  34 +++---
 .../Dialect/SparseTensor/sparse_fusion.mlir   |  10 +-
 .../Dialect/SparseTensor/sparse_int_ops.mlir  |  34 +++---
 .../Dialect/SparseTensor/sparse_kernels.mlir  |  18 +--
 .../sparse_kernels_to_iterator.mlir           |   2 +-
 .../Dialect/SparseTensor/sparse_lower.mlir    |   8 +-
 .../SparseTensor/sparse_lower_col.mlir        |   8 +-
 .../SparseTensor/sparse_lower_inplace.mlir    |   8 +-
 mlir/test/Dialect/SparseTensor/sparse_nd.mlir |   4 +-
 .../Dialect/SparseTensor/sparse_outbuf.mlir   |   6 +-
 .../Dialect/SparseTensor/sparse_pack.mlir     |  12 +-
 .../SparseTensor/sparse_parallel_reduce.mlir  |   4 +-
 .../Dialect/SparseTensor/sparse_perm.mlir     |   4 +-
 .../SparseTensor/sparse_perm_lower.mlir       |   4 +-
 .../Dialect/SparseTensor/sparse_scalars.mlir  |   4 +-
 .../Dialect/SparseTensor/sparse_sddmm.mlir    |  10 +-
 .../SparseTensor/sparse_sddmm_org.mlir        |   4 +-
 .../SparseTensor/sparse_vector_chain.mlir     |   2 +-
 .../SparseTensor/sparse_vector_index.mlir     |   4 +-
 mlir/test/Dialect/SparseTensor/spy_sddmm.mlir |   4 +-
 .../Dialect/SparseTensor/spy_sddmm_bsr.mlir   |   4 +-
 .../Dialect/SparseTensor/unused-tensor.mlir   |   4 +-
 .../SparseTensor/vectorize_reduction.mlir     |  28 ++---
 mlir/test/Dialect/Tensor/bufferize.mlir       |  34 +++---
 .../Tensor/one-shot-bufferize-encodings.mlir  |  20 +++
 .../Dialect/Tensor/one-shot-bufferize.mlir    |   2 +-
 mlir/test/Dialect/Vector/bufferize.mlir       |   6 +-
 .../GPU/CUDA/sparse-matvec-const.mlir         |   2 +-
 .../GPU/CUDA/sparse-mma-2-4-f16.mlir          |   2 +-
 .../Tosa/CPU/test-maxpool-dynamic.mlir        |   4 +-
 .../Dialect/Vector/CPU/AMX/mulf-full.mlir     |   4 +-
 .../Dialect/Vector/CPU/AMX/muli-full.mlir     |   4 +-
 73 files changed, 608 insertions(+), 373 deletions(-)
 create mode 100644 mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir
 create mode 100644 mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir
 create mode 100644 mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir

diff --git a/mlir/docs/Bufferization.md b/mlir/docs/Bufferization.md
index e16fe91212a1a5..02cfee5f2b8dcd 100644
--- a/mlir/docs/Bufferization.md
+++ b/mlir/docs/Bufferization.md
@@ -223,8 +223,8 @@ func.func @test_matmul(%A: memref<1x17x19xf32>,
                        %B: memref<1x19x29xf32>,
                        %C: memref<1x17x29xf32>) {
 
-  %A_tensor = bufferization.to_tensor %A restrict : memref<1x17x19xf32>
-  %B_tensor = bufferization.to_tensor %B restrict : memref<1x19x29xf32>
+  %A_tensor = bufferization.to_tensor %A restrict : memref<1x17x19xf32> to tensor<1x17x19xf32>
+  %B_tensor = bufferization.to_tensor %B restrict : memref<1x19x29xf32> to tensor<1x19x29xf32>
 
   %0 = tosa.matmul %A_tensor, %B_tensor
       : (tensor<1x17x19xf32>, tensor<1x19x29xf32>) ->
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
index 7bcc3b9e799865..fad78a63444b91 100644
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizationOps.td
@@ -387,9 +387,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
     BufferizableOpInterface,
     SameOperandsAndResultShape,
     SameOperandsAndResultElementType,
-    TypesMatchWith<"result type matches tensor equivalent of 'memref'",
-                   "memref", "result",
-                   "memref::getTensorTypeFromMemRefType($_self)">
+    AllElementTypesMatch<["memref", "result"]>
   ]> {
   let summary = "create a tensor from a `memref`";
   let description = [{
@@ -404,7 +402,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
 
     ```mlir
     // Produces a value of tensor<4x?xf32> type.
-    %t = bufferization.to_tensor %m : memref<4x?xf32, #layout, 0>
+    %t = bufferization.to_tensor %m : memref<4x?xf32, #layout, 0> to tensor<4x?xf32>
     ```
 
     If the `writable` unit attribute is set, the produced tensor is considered
@@ -427,7 +425,7 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
     Example:
 
     ```
-    %t = bufferization.to_tensor %m restrict writable : memref<4xf32>
+    %t = bufferization.to_tensor %m restrict writable : memref<4xf32> to tensor<4xf32>
 
     // %t is writable, so the tensor.insert may bufferize in-place in the
     // absence of other conflicts.
@@ -476,9 +474,16 @@ def Bufferization_ToTensorOp : Bufferization_Op<"to_tensor", [
 
   let assemblyFormat = [{
     $memref (`restrict` $restrict^)? (`writable` $writable^)? attr-dict
-      `:` type($memref)
+      `:` type($memref) `to` type($result)
   }];
 
+  let builders = [
+    OpBuilder<(ins "Value":$memref, CArg<"bool", "false">:$restrict, CArg<"bool", "false">:$writeable), [{
+      auto rtt = memref::getTensorTypeFromMemRefType(memref.getType());
+      build($_builder, $_state, rtt, memref, restrict, writeable);
+    }]>
+  ];
+
   let hasCanonicalizer = 1;
   let hasFolder = 1;
 }
@@ -493,9 +498,8 @@ def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [
     SameOperandsAndResultShape,
     SameOperandsAndResultElementType,
     Pure,
-    TypesMatchWith<"type of 'tensor' is the tensor equivalent of 'memref'",
-                   "memref", "tensor",
-                   "memref::getTensorTypeFromMemRefType($_self)">
+    AllShapesMatch<["memref", "tensor"]>,
+    AllElementTypesMatch<["memref", "tensor"]>
   ]> {
   let summary = "cast a tensor to memref";
   let description = [{
@@ -503,7 +507,7 @@ def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [
 
     ```mlir
     // Result type is memref<4x?xf32, #layout, 0>
-    %m = bufferization.to_memref %t : memref<4x?xf32, #layout, 0>
+    %m = bufferization.to_memref %t : tensor<4x?xf32> to memref<4x?xf32, #layout, 0>
     ```
 
     This operation is a specialized variant of the built-in
@@ -550,7 +554,7 @@ def Bufferization_ToMemrefOp : Bufferization_Op<"to_memref", [
   }];
 
   let assemblyFormat = [{
-    $tensor (`read_only` $read_only^)? attr-dict `:` type($memref)
+    $tensor (`read_only` $read_only^)? attr-dict `:` type($tensor) `to` type($memref)
   }];
 
   let hasFolder = 1;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index 3e93f33ffe0fb4..3bcde8edde5098 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -250,8 +250,8 @@ def OptimizeAllocationLiveness
   let summary = "This pass optimizes the liveness of temp allocations in the "
                 "input function";
   let description =
-       [{This pass will find all operations that have a memory allocation effect. 
-       It will search for the corresponding deallocation and move it right after 
+       [{This pass will find all operations that have a memory allocation effect.
+       It will search for the corresponding deallocation and move it right after
        the last user of the allocation.
        This will optimize the liveness of the allocations.
 
@@ -510,6 +510,10 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
            /*default=*/"false",
            "The memory space of an memref types must always be inferred. If "
            "unset, a default memory space of 0 is used otherwise.">,
+    Option<"useEncodingForMemorySpace", "use-encoding-for-memory-space", "bool",
+            /*default=*/"false",
+            "Use the Tensor encoding attribute for the memory space. Exclusive to"
+            " the 'must-infer-memory-space' option">,
     Option<"testAnalysisOnly", "test-analysis-only", "bool",
             /*default=*/"false",
            "Test only: Only run inplaceability analysis and annotate IR">,
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 85604eef2f2830..065739ea8e5951 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -718,7 +718,7 @@ void bufferization::replaceOpWithBufferizedValues(RewriterBase &rewriter,
       // loose all of its users and eventually DCE away.
       rewriter.setInsertionPointAfter(op);
       replacement = rewriter.create<bufferization::ToTensorOp>(
-          replacement.getLoc(), replacement);
+          replacement.getLoc(), opResult.getType(), replacement);
     }
     replacements.push_back(replacement);
   }
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 62ce2583f4fa1d..c6a0320d24b5eb 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -69,7 +69,7 @@ BufferizeTypeConverter::BufferizeTypeConverter() {
     if (auto inputType = dyn_cast<MemRefType>(inputs[0].getType())) {
       // MemRef to MemRef cast.
       assert(inputType != type && "expected different types");
-      // Unranked to ranked and ranked to unranked casts must be explicit.
+      // Ranked to unranked casts must be explicit.
       auto rankedDestType = dyn_cast<MemRefType>(type);
       if (!rankedDestType)
         return nullptr;
@@ -147,12 +147,31 @@ struct OneShotBufferizePass
       opt.dumpAliasSets = dumpAliasSets;
       opt.setFunctionBoundaryTypeConversion(
           parseLayoutMapOption(functionBoundaryTypeConversion));
+
+      if (mustInferMemorySpace && useEncodingForMemorySpace) {
+        emitError(getOperation()->getLoc())
+            << "only one of 'must-infer-memory-space' and "
+               "'use-encoding-for-memory-space' are allowed in "
+            << getArgument();
+        return signalPassFailure();
+      }
+
       if (mustInferMemorySpace) {
         opt.defaultMemorySpaceFn =
             [](TensorType t) -> std::optional<Attribute> {
           return std::nullopt;
         };
       }
+
+      if (useEncodingForMemorySpace) {
+        opt.defaultMemorySpaceFn =
+            [](TensorType t) -> std::optional<Attribute> {
+          if (auto rtt = dyn_cast<RankedTensorType>(t))
+            return rtt.getEncoding();
+          return std::nullopt;
+        };
+      }
+
       opt.printConflicts = printConflicts;
       opt.bufferAlignment = bufferAlignment;
       opt.testAnalysisOnly = testAnalysisOnly;
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
index 779c41a22e9ee2..e9d7dc1b847c61 100644
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -203,7 +203,8 @@ struct ExecuteRegionOpInterface
     for (const auto &it : llvm::enumerate(executeRegionOp->getResultTypes())) {
       if (isa<TensorType>(it.value())) {
         newResults.push_back(rewriter.create<bufferization::ToTensorOp>(
-            executeRegionOp.getLoc(), newOp->getResult(it.index())));
+            executeRegionOp.getLoc(), it.value(),
+            newOp->getResult(it.index())));
       } else {
         newResults.push_back(newOp->getResult(it.index()));
       }
@@ -485,15 +486,17 @@ getBuffers(RewriterBase &rewriter, const MutableOperandRange &operands,
 /// ToTensorOps, so that the block body can be moved over to the new op.
 static SmallVector<Value>
 getBbArgReplacements(RewriterBase &rewriter, Block::BlockArgListType bbArgs,
+                     Block::BlockArgListType oldBbArgs,
                      const DenseSet<int64_t> &tensorIndices) {
   SmallVector<Value> result;
   for (const auto &it : llvm::enumerate(bbArgs)) {
     size_t idx = it.index();
     Value val = it.value();
     if (tensorIndices.contains(idx)) {
-      result.push_back(
-          rewriter.create<bufferization::ToTensorOp>(val.getLoc(), val)
-              .getResult());
+      result.push_back(rewriter
+                           .create<bufferization::ToTensorOp>(
+                               val.getLoc(), oldBbArgs[idx].getType(), val)
+                           .getResult());
     } else {
       result.push_back(val);
     }
@@ -763,7 +766,8 @@ struct ForOpInterface
     // iter_args of the new loop in ToTensorOps.
     rewriter.setInsertionPointToStart(loopBody);
     SmallVector<Value> iterArgs =
-        getBbArgReplacements(rewriter, newForOp.getRegionIterArgs(), indices);
+        getBbArgReplacements(rewriter, newForOp.getRegionIterArgs(),
+                             forOp.getRegionIterArgs(), indices);
     iterArgs.insert(iterArgs.begin(), newForOp.getInductionVar());
 
     // Move loop body to new loop.
@@ -1000,16 +1004,18 @@ struct WhileOpInterface
     // The old block uses tensors, so wrap the (memref) bbArgs of the new block
     // in ToTensorOps.
     rewriter.setInsertionPointToStart(newBeforeBody);
-    SmallVector<Value> newBeforeArgs = getBbArgReplacements(
-        rewriter, newWhileOp.getBeforeArguments(), indicesBefore);
+    SmallVector<Value> newBeforeArgs =
+        getBbArgReplacements(rewriter, newWhileOp.getBeforeArguments(),
+                             whileOp.getBeforeArguments(), indicesBefore);
     rewriter.mergeBlocks(whileOp.getBeforeBody(), newBeforeBody, newBeforeArgs);
 
     // Set up new iter_args and move the loop body block to the new op.
     // The old block uses tensors, so wrap the (memref) bbArgs of the new block
     // in ToTensorOps.
     rewriter.setInsertionPointToStart(newAfterBody);
-    SmallVector<Value> newAfterArgs = getBbArgReplacements(
-        rewriter, newWhileOp.getAfterArguments(), indicesAfter);
+    SmallVector<Value> newAfterArgs =
+        getBbArgReplacements(rewriter, newWhileOp.getAfterArguments(),
+                             whileOp.getAfterArguments(), indicesAfter);
     rewriter.mergeBlocks(whileOp.getAfterBody(), newAfterBody, newAfterArgs);
 
     // Replace loop results.
@@ -1255,8 +1261,8 @@ struct ForallOpInterface
              forallOp.getBody()->getArguments().drop_front(rank), buffers)) {
       BlockArgument bbArg = std::get<0>(it);
       Value buffer = std::get<1>(it);
-      Value bufferAsTensor =
-          rewriter.create<ToTensorOp>(forallOp.getLoc(), buffer);
+      Value bufferAsTensor = rewriter.create<ToTensorOp>(
+          forallOp.getLoc(), bbArg.getType(), buffer);
       bbArg.replaceAllUsesWith(bufferAsTensor);
     }
 
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
index c2b8614148bf25..9797b73f534a96 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -480,10 +480,6 @@ struct FromElementsOpInterface
     auto fromElementsOp = cast<tensor::FromElementsOp>(op);
     auto tensorType = cast<RankedTensorType>(fromElementsOp.getType());
 
-    // TODO: Implement memory space for this op.
-    if (options.defaultMemorySpaceFn(tensorType) != Attribute())
-      return op->emitError("memory space not implemented yet");
-
     // Allocate a buffer for the result.
     Location loc = op->getLoc();
     auto shape = tensorType.getShape();
@@ -493,10 +489,12 @@ struct FromElementsOpInterface
         /*copy=*/false);
     if (failed(tensorAlloc))
       return failure();
-    auto memrefType =
-        MemRefType::get(tensorType.getShape(), tensorType.getElementType());
+    FailureOr<BaseMemRefType> memrefType =
+        bufferization::getBufferType(*tensorAlloc, options);
+    if (failed(memrefType))
+      return failure();
     Value buffer = rewriter.create<bufferization::ToMemrefOp>(
-        op->getLoc(), memrefType, *tensorAlloc);
+        op->getLoc(), *memrefType, *tensorAlloc);
 
     // Case: tensor<0xelem_type>.
     if (fromElementsOp.getElements().empty()) {
diff --git a/mlir/test/Dialect/Affine/loop-fusion-4.mlir b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
index f46ad0f5e4c232..ea144f73bb21c6 100644
--- a/mlir/test/Dialect/Affine/loop-fusion-4.mlir
+++ b/mlir/test/Dialect/Affine/loop-fusion-4.mlir
@@ -242,7 +242,7 @@ module {
     ^bb0(%arg1: index, %arg2: index, %arg3: index, %arg4: index):
       tensor.yield %cst_f32 : f32
     } : tensor<1x32x32x8xf32> to tensor<1x40x8229x8xf32>
-    %1 = bufferization.to_memref %padded : memref<1x40x8229x8xf32>
+    %1 = bufferization.to_memref %padded : tensor<1x40x8229x8xf32> to memref<1x40x8229x8xf32>
     %alloc_0 = memref.alloc() {alignment = 64 : i64} : memref<1x32x32x8xf32>
     affine.for %arg1 = 0 to 1 {
       affine.for %arg2 = 0 to 32 {
@@ -280,7 +280,7 @@ module {
     // SPIRV-NOT:       affine.for %{{.*}}
 
     // SPIRV:       ReturnValue
-    %2 = bufferization.to_tensor %alloc_1 : memref<1x32x32x8xf32>
+    %2 = bufferization.to_tensor %alloc_1 : memref<1x32x32x8xf32> to tensor<1x32x32x8xf32>
     %3 = builtin.unrealized_conversion_cast %2 : tensor<1x32x32x8xf32> to !spirv.array<8192 x f32>
     spirv.ReturnValue %3 : !spirv.array<8192 x f32>
   }
diff --git a/mlir/test/Dialect/Arith/bufferize.mlir b/mlir/test/Dialect/Arith/bufferize.mlir
index a3b1454fb68f66..0b7838e1471d3d 100644
--- a/mlir/test/Dialect/Arith/bufferize.mlir
+++ b/mlir/test/Dialect/Arith/bufferize.mlir
@@ -7,7 +7,7 @@ func.func @index_cast(%tensor: tensor<i32>, %scalar: i32) -> (tensor<index>, ind
   %index_scalar = arith.index_cast %scalar : i32 to index
   return %index_tensor, %index_scalar : tensor<index>, index
 }
-// CHECK:  %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<i32>
+// CHECK:  %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<i32>
 // CHECK-NEXT: %[[INDEX_MEMREF:.*]] = arith.index_cast %[[MEMREF]]
 // CHECK-SAME:   memref<i32> to memref<index>
 // CHECK-NEXT: %[[INDEX_TENSOR:.*]] = bufferization.to_tensor %[[INDEX_MEMREF]]
@@ -83,8 +83,8 @@ func.func @non_tensor() {
 // CHECK-SAME:                 %[[PRED:.*]]: i1,
 // CHECK-SAME:                 %[[TRUE_VAL:.*]]: tensor<f32>,
 // CHECK-SAME:                 %[[FALSE_VAL:.*]]: tensor<f32>) -> tensor<f32> {
-// CHECK-DAG:           %[[TRUE_VAL_MEMREF:.*]] = bufferization.to_memref %[[TRUE_VAL]] : memref<f32>
-// CHECK-DAG:           %[[FALSE_VAL_MEMREF:.*]] = bufferization.to_memref %[[FALSE_VAL]] : memref<f32>
+// CHECK-DAG:           %[[TRUE_VAL_MEMREF:.*]] = bufferization.to_memref %[[TRUE_VAL]] : tensor<f32>
+// CHECK-DAG:           %[[FALSE_VAL_MEMREF:.*]] = bufferization.to_memref %[[FALSE_VAL]] : tensor<f32>
 // CHECK:           %[[RET_MEMREF:.*]] = arith.select %[[PRED]], %[[TRUE_VAL_MEMREF]], %[[FALSE_VAL_MEMREF]] : memref<f32>
 // CHECK:           %[[RET:.*]] = bufferization.to_tensor %[[RET_MEMREF]] : memref<f32>
 // CHECK:           return %[[RET]] : tensor<f32>
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir
index 5293977fe733f5..5d0657eb38baa6 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir
@@ -9,7 +9,7 @@
 //  CHECK-NEXT:   %[[clone:.*]] = bufferization.clone %[[m]]
 //  CHECK-NEXT:   return %[[clone]]
 func.func private @no_interface_no_operands(%t : tensor<?x?x?xf16>) -> memref<?x?x?xf16> {
-  %0 = bufferization.to_memref %t : memref<?x?x?xf16>
+  %0 = bufferization.to_memref %t : tensor<?x?x?xf16> to memref<?x?x?xf16>
   return %0 : memref<?x?x?xf16>
 }
 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir
index c3e44c426797f3..7d429e48401144 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-analysis.mlir
@@ -96,7 +96,7 @@ func.func @to_memref_not_read_only(%idx : index, %f: f32) -> f32 {
   // Some op may write into the result of to_memref later.
   // CHECK: bufferization.to_memref
   // CHECK-SAME: {__inplace_operands_attr__ = ["false"]}
-  %m = bufferization.to_memref %t : memref<5xf32>
+  %m = bufferization.to_memref %t : tensor<5xf32> to memref<5xf32>
   %2 = tensor.extract %t[%idx] : tensor<5xf32>
   return %2 : f32
 }
@@ -112,7 +112,7 @@ func.func @to_memref_read_only(%idx : index, %f: f32) -> f32 {
   // Some op may write into the result of to_memref later.
   // CHECK: bufferization.to_memref
   // CHECK-SAME: {__inplace_operands_attr__ = ["true"]}
-  %m = bufferization.to_memref %t {read_only} : memref<5xf32>
+  %m = bufferization.to_memref %t {read_only} : tensor<5xf32> to memref<5xf32>
   %2 = tensor.extract %t[%idx] : tensor<5xf32>
   return %2 : f32
 }
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir
new file mode 100644
index 00000000000000..c26f1681e4d96b
--- /dev/null
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-encodings.mlir
@@ -0,0 +1,111 @@
+// RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space" -split-input-file | FileCheck %s
+
+func.func @alloc_tesor_with_space_no_encoding() -> tensor<128xf32> {
+  %0 = bufferization.alloc_tensor() {memory_space = 1 : i64} : tensor<128xf32>
+  return %0 : tensor<128xf32>
+}
+
+// CHECK-LABEL: @alloc_tesor_with_space_no_encoding
+//  CHECK-SAME: () -> tensor<128xf32> {
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 1> to tensor<128xf32>
+//       CHECK:     return %[[v0]] : tensor<128xf32>
+
+// -----
+
+func.func @alloc_tesor_with_space_and_cast() -> tensor<128xf32, 1> {
+  %0 = bufferization.alloc_tensor() {memory_space = 1 : i64} : tensor<128xf32>
+  %1 = tensor.cast %0 : tensor<128xf32> to tensor<128xf32, 1>
+  return %1 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: @alloc_tesor_with_space_and_cast
+//  CHECK-SAME: () -> tensor<128xf32, 1 : i64> {
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 1> to tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v0]] : tensor<128xf32, 1 : i64>
+
+// -----
+
+func.func @alloc_tesor_with_space_with_encoding() -> tensor<128xf32, 1 : i64> {
+  %0 = bufferization.alloc_tensor() {memory_space = 1 : i64} : tensor<128xf32, 1 : i64>
+  return %0 : tensor<128xf32, 1 : i64>
+}
+
+// CHECK-LABEL: @alloc_tesor_with_space_with_encoding
+//  CHECK-SAME: () -> tensor<128xf32, 1 : i64> {
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 1> to tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v0]] : tensor<128xf32, 1 : i64>
+
+// -----
+
+func.func @alloc_tesor_copy_from_default_space(%arg0: tensor<128xf32>) -> tensor<128xf32> {
+  %0 = bufferization.alloc_tensor() copy(%arg0) {memory_space = 1 : i64} : tensor<128xf32>
+  return %0 : tensor<128xf32>
+}
+
+// CHECK-LABEL: @alloc_tesor_copy_from_default_space
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32>) -> tensor<128xf32> {
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32> to memref<128xf32, strided<[?], offset: ?>>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>> to memref<128xf32, 1>
+//       CHECK:     %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 1> to tensor<128xf32>
+//       CHECK:     return %[[v1]] : tensor<128xf32>
+
+// -----
+
+func.func @alloc_tesor_copy_from_non_default_space(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 2> {
+  %0 = bufferization.alloc_tensor() copy(%arg0) {memory_space = 2 : i64} : tensor<128xf32, 1>
+  %1 = tensor.cast %0 : tensor<128xf32, 1> to tensor<128xf32, 2>
+  return %1 : tensor<128xf32, 2>
+}
+
+// CHECK-LABEL: @alloc_tesor_copy_from_non_default_space
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>) -> tensor<128xf32, 2 : i64> {
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 2>
+//       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2>
+//       CHECK:     %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 2 : i64>
+//       CHECK:     return %[[v1]] : tensor<128xf32, 2 : i64>
+
+// -----
+
+// TODO: this should be illegal since ultimately we can not eliminate the `bufferization.to_tensor` when we
+// bufferize function boundaries.
+func.func @alloc_tesor_copy_from_non_default_space_no_cast(%arg0: tensor<128xf32, 1>,
+                                                           %arg1: tensor<4xf32, 1>) -> tensor<128xf32, 1> {
+  %0 = bufferization.alloc_tensor() copy(%arg0) {memory_space = 2 : i64} : tensor<128xf32, 1>
+  %1 = tensor.insert_slice %arg1 into %arg0 [0][4][1] : tensor<4xf32, 1> into tensor<128xf32, 1>
+  return %0 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: @alloc_tesor_copy_from_non_default_space_no_cast
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>, %[[arg1:.+]]: tensor<4xf32, 1 : i64>) -> tensor<128xf32, 1 : i64> {
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg1]] : tensor<4xf32, 1 : i64> to memref<4xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v2:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 2>
+//       CHECK:     memref.copy %[[v2]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2>
+//       CHECK:     %[[v3:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 1 : i64>
+//       CHECK:     %[[alloc_0:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     memref.copy %[[v1]], %[[alloc_0]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 1>
+//       CHECK:     %[[subview:.+]] = memref.subview %[[alloc_0]][0] [4] [1] : memref<128xf32, 1> to memref<4xf32, strided<[1]>, 1>
+//       CHECK:     memref.copy %[[v0]], %[[subview]] : memref<4xf32, strided<[?], offset: ?>, 1> to memref<4xf32, strided<[1]>, 1>
+//       CHECK:     return %[[v3]] : tensor<128xf32, 1 : i64>
+
+// -----
+
+func.func @materialize_in_destination(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 2> {
+  %0 = bufferization.alloc_tensor () {memory_space = 2 : i64} : tensor<128xf32, 2>
+  %1 = bufferization.materialize_in_destination %arg0 in %0 : (tensor<128xf32, 1>, tensor<128xf32, 2>) -> tensor<128xf32, 2>
+  return %1 : tensor<128xf32, 2>
+}
+
+// CHECK-LABEL: @materialize_in_destination
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>) -> tensor<128xf32, 2 : i64> {
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 2>
+//       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 2>
+//       CHECK:     %[[v1:.+]] = bufferization.to_tensor %[[alloc]] : memref<128xf32, 2> to tensor<128xf32, 2 : i64>
+//       CHECK:     return %[[v1]] : tensor<128xf32, 2 : i64>
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
index 9380c81ce235cd..194c3278c78a1d 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-partial.mlir
@@ -25,9 +25,9 @@ func.func @use_of_unknown_op_1(%t1: tensor<?xf32>)
 
   %idx = arith.constant 0 : index
   %cst = arith.constant 0.0 : f32
-  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : memref<?xf32, strided<[?], offset: ?>>
+  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor<?xf32> to memref<?xf32, strided<[?], offset: ?>>
   // CHECK: vector.transfer_read %[[dummy_memref]][%{{.*}}], %{{.*}} : memref<?xf32, strided<[?], offset: ?>>
-  // CHECK-NO-LAYOUT-MAP: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : memref<?xf32>
+  // CHECK-NO-LAYOUT-MAP: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor<?xf32> to memref<?xf32>
   // CHECK-NO-LAYOUT-MAP: vector.transfer_read %[[dummy_memref]][%{{.*}}], %{{.*}} : memref<?xf32>
   %1 = vector.transfer_read %0[%idx], %cst : tensor<?xf32>, vector<5xf32>
   return %1 : vector<5xf32>
@@ -61,7 +61,7 @@ func.func @use_of_unknown_op_3(%t1: tensor<?xf32>)
 
   // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1]])
   %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>
-  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : memref<?xf32, strided<[?], offset: ?>>
+  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]] : tensor<?xf32> to memref<?xf32, strided<[?], offset: ?>>
   // CHECK: %[[v2:.*]] = vector.transfer_read %[[dummy_memref]]
   %2 = vector.transfer_read %0[%idx], %cst : tensor<?xf32>, vector<5xf32>
 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
index dbf8d6563477b5..e65c5b92949f6e 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize.mlir
@@ -134,7 +134,7 @@ func.func @copy_deallocated() -> tensor<10xf32> {
 // CHECK-LABEL: func @select_different_tensors(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?xf32>
 func.func @select_different_tensors(%t: tensor<?xf32>, %sz: index, %pos: index, %c: i1) -> f32 {
-  // CHECK-DAG: %[[m:.*]] = bufferization.to_memref %[[t]] : memref<?xf32, strided{{.*}}>
+  // CHECK-DAG: %[[m:.*]] = bufferization.to_memref %[[t]] : tensor<?xf32> to memref<?xf32, strided{{.*}}>
   // CHECK-DAG: %[[alloc:.*]] = memref.alloc(%{{.*}}) {{.*}} : memref<?xf32>
   %0 = bufferization.alloc_tensor(%sz) : tensor<?xf32>
 
@@ -200,7 +200,7 @@ func.func @read_of_alias(%t: tensor<100xf32>, %pos1: index, %pos2: index,
 // CHECK-LABEL: func @from_unranked_to_unranked(
 //  CHECK-SAME:     %[[arg0:.*]]: tensor<*xi32>
 func.func @from_unranked_to_unranked(%arg0: tensor<*xi32>) -> tensor<*xi32> {
-  // CHECK: %[[m:.*]] = bufferization.to_memref %[[arg0]] : memref<*xi32>
+  // CHECK: %[[m:.*]] = bufferization.to_memref %[[arg0]] : tensor<*xi32> to memref<*xi32>
   // CHECK: %[[t:.*]] = bufferization.to_tensor %[[m]]
   // CHECK: return %[[t]] : tensor<*xi32>
   %0 = tensor.cast %arg0 : tensor<*xi32> to tensor<*xi32>
@@ -227,7 +227,7 @@ func.func @tensor_copy(%arg0: tensor<5xf32>) -> tensor<5xf32> {
 
 // CHECK-LABEL: func @materialize_in_destination_buffer(
 //  CHECK-SAME:     %[[t:.*]]: tensor<5xf32>, %[[m:.*]]: memref<5xf32>)
-//       CHECK:   %[[b:.*]] = bufferization.to_memref %[[t]] : memref<5xf32, strided<[?], offset: ?>>
+//       CHECK:   %[[b:.*]] = bufferization.to_memref %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>>
 //       CHECK:   memref.copy %[[b]], %[[m]]
 func.func @materialize_in_destination_buffer(%t: tensor<5xf32>, %m: memref<5xf32>) {
   bufferization.materialize_in_destination %t in restrict writable %m
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
index 35b28f7ec83919..2ca7f7109005cc 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir
@@ -1064,7 +1064,7 @@ func.func @main_func(%A : tensor<?xf32> {bufferization.writable = true},
 func.func @to_tensor_op_not_writable(%m: memref<?xf32>, %v:  vector<5xf32>,
                                      %idx1: index, %idx2: index)
     -> vector<10xf32> {
-  %0 = bufferization.to_tensor %m restrict : memref<?xf32>
+  %0 = bufferization.to_tensor %m restrict : memref<?xf32> to tensor<?xf32>
 
   // Write to the tensor. Cannot be inplace due to tensor_load.
   //      CHECK: vector.transfer_write
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir
index 7685f2ef3aafe5..230a0ed4294899 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-force-copy-before-write.mlir
@@ -28,9 +28,9 @@ module {
   // CHECK_COPY:           memref.copy
 
   func.func @contains_to_memref_op(%arg0: tensor<?xf32> {bufferization.writable = true}, %arg1: index) -> vector<5xf32> {
-    %0 = bufferization.to_memref %arg0 : memref<?xf32>
+    %0 = bufferization.to_memref %arg0 : tensor<?xf32> to memref<?xf32>
     %cst = arith.constant 0.000000e+00 : f32
     %1 = vector.transfer_read %0[%arg1], %cst : memref<?xf32>, vector<5xf32>
     return %1 : vector<5xf32>
   }
-}
\ No newline at end of file
+}
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir
index d773e1af43a76e..29714e61d336ac 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-invalid.mlir
@@ -76,7 +76,7 @@ func.func @scf_while_non_equiv_yield(%arg0: tensor<5xi1>,
 
 func.func @to_tensor_op_unsupported(%m: memref<?xf32>, %idx: index) -> (f32) {
   // expected-error @+1 {{to_tensor ops without `restrict` are not supported by One-Shot Analysis}}
-  %0 = bufferization.to_tensor %m : memref<?xf32>
+  %0 = bufferization.to_tensor %m : memref<?xf32> to tensor<?xf32>
 
   %1 = tensor.extract %0[%idx] : tensor<?xf32>
   return %1 : f32
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
index 65557a68d243a2..ec2fb58ee03f8a 100644
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize.mlir
@@ -679,7 +679,7 @@ func.func @to_memref_op_unsupported(
   // to_memref op.
   // CHECK: %[[alloc:.*]] = memref.alloc
   // CHECK: memref.copy %[[arg0]], %[[alloc]]
-  %0 = bufferization.to_memref %t1 : memref<?xf32>
+  %0 = bufferization.to_memref %t1 : tensor<?xf32> to memref<?xf32>
   // CHECK: "test.foo"(%[[alloc]])
   "test.foo"(%0) : (memref<?xf32>) -> ()
 
diff --git a/mlir/test/Dialect/Bufferization/canonicalize.mlir b/mlir/test/Dialect/Bufferization/canonicalize.mlir
index b6c0a0e25efe0e..3ebc1e4fa8dea3 100644
--- a/mlir/test/Dialect/Bufferization/canonicalize.mlir
+++ b/mlir/test/Dialect/Bufferization/canonicalize.mlir
@@ -6,8 +6,8 @@
 // Basic folding of to_tensor(to_memref(t)) -> t
 // CHECK-LABEL: func @tensor_load_of_buffer_cast(
 func.func @tensor_load_of_buffer_cast(%arg0: tensor<?xf32>) -> tensor<?xf32> {
-  %0 = bufferization.to_memref %arg0 : memref<?xf32>
-  %1 = bufferization.to_tensor %0 : memref<?xf32>
+  %0 = bufferization.to_memref %arg0 : tensor<?xf32> to memref<?xf32>
+  %1 = bufferization.to_tensor %0 : memref<?xf32> to tensor<?xf32>
   return %1 : tensor<?xf32>
 }
 // CHECK-SAME:   %[[TENSOR:.*]]: tensor<?xf32>) -> tensor<?xf32> {
@@ -18,8 +18,8 @@ func.func @tensor_load_of_buffer_cast(%arg0: tensor<?xf32>) -> tensor<?xf32> {
 // Basic folding of to_memref(to_tensor(m)) -> m
 // CHECK-LABEL: func @buffer_cast_of_tensor_load(
 func.func @buffer_cast_of_tensor_load(%arg0: memref<?xf32>) -> memref<?xf32> {
-  %0 = bufferization.to_tensor %arg0 : memref<?xf32>
-  %1 = bufferization.to_memref %0 : memref<?xf32>
+  %0 = bufferization.to_tensor %arg0 : memref<?xf32> to tensor<?xf32>
+  %1 = bufferization.to_memref %0 : tensor<?xf32> to memref<?xf32>
   return %1 : memref<?xf32>
 }
 // CHECK-SAME:   %[[MEMREF:.*]]: memref<?xf32>) -> memref<?xf32> {
@@ -34,14 +34,14 @@ func.func @buffer_cast_of_tensor_load(%arg0: memref<?xf32>) -> memref<?xf32> {
 //  CHECK-SAME:   %[[MEMREF_ADDRSPACE2:.*]]: memref<?xf32, 2>)
 //  CHECK-SAME:     -> memref<?xf32, 7> {
 //       CHECK: %[[TENSOR:.*]] = bufferization.to_tensor
-//  CHECK-SAME:   %[[MEMREF_ADDRSPACE2]] : memref<?xf32, 2>
+//  CHECK-SAME:   %[[MEMREF_ADDRSPACE2]] : memref<?xf32, 2> to tensor<?xf32, 7 : i64>
 //       CHECK: %[[MEMREF_ADDRSPACE7:.*]] = bufferization.to_memref
-//  CHECK-SAME:   %[[TENSOR]] : memref<?xf32, 7>
+//  CHECK-SAME:   %[[TENSOR]] : tensor<?xf32, 7 : i64> to memref<?xf32, 7>
 //       CHECK: return %[[MEMREF_ADDRSPACE7]]
 func.func @no_fold_buffer_cast_of_tensor_load(%arg0: memref<?xf32, 2>)
     -> memref<?xf32, 7> {
-  %0 = bufferization.to_tensor %arg0 : memref<?xf32, 2>
-  %1 = bufferization.to_memref %0 : memref<?xf32, 7>
+  %0 = bufferization.to_tensor %arg0 : memref<?xf32, 2> to tensor<?xf32, 7>
+  %1 = bufferization.to_memref %0 : tensor<?xf32, 7> to memref<?xf32, 7>
   return %1 : memref<?xf32, 7>
 }
 
@@ -61,8 +61,8 @@ func.func @canonicalize_buffer_cast_of_tensor_load(
   %arg0: memref<?xf32, strided<[1], offset: 3>>)
   -> memref<?xf32, strided<[1], offset: ?>>
 {
-  %0 = bufferization.to_tensor %arg0 : memref<?xf32, strided<[1], offset: 3>>
-  %1 = bufferization.to_memref %0 : memref<?xf32, strided<[1], offset: ?>>
+  %0 = bufferization.to_tensor %arg0 : memref<?xf32, strided<[1], offset: 3>> to tensor<?xf32>
+  %1 = bufferization.to_memref %0 : tensor<?xf32> to memref<?xf32, strided<[1], offset: ?>>
   return %1 : memref<?xf32, strided<[1], offset: ?>>
 }
 
@@ -74,8 +74,8 @@ func.func @canonicalize_buffer_cast_of_tensor_load(
 func.func @canonicalize_buffer_cast_of_tensor_load_to_copy(
   %arg0: memref<?xf32, strided<[1], offset: ?>>)
   -> memref<?xf32, strided<[1], offset: 3>> {
-  %0 = bufferization.to_tensor %arg0 : memref<?xf32, strided<[1], offset: ?>>
-  %1 = bufferization.to_memref %0 : memref<?xf32, strided<[1], offset: 3>>
+  %0 = bufferization.to_tensor %arg0 : memref<?xf32, strided<[1], offset: ?>> to tensor<?xf32>
+  %1 = bufferization.to_memref %0 : tensor<?xf32> to memref<?xf32, strided<[1], offset: 3>>
   return %1 : memref<?xf32, strided<[1], offset: 3>>
 }
 // CHECK-SAME:   %[[M:.*]]: memref<?xf32, strided<[1], offset: ?>>)
@@ -100,7 +100,7 @@ func.func @canonicalize_buffer_cast_of_tensor_load_to_copy(
 //       CHECK:   return %[[D]] : index
 func.func @dim_of_tensor_load(%arg0: memref<?xf32>) -> index {
   %c0 = arith.constant 0 : index
-  %0 = bufferization.to_tensor %arg0 : memref<?xf32>
+  %0 = bufferization.to_tensor %arg0 : memref<?xf32> to tensor<?xf32>
   %1 = tensor.dim %0, %c0 : tensor<?xf32>
   return %1 : index
 }
@@ -252,10 +252,10 @@ func.func @clone_and_preceding_dealloc(%arg0: memref<?xf32>) -> memref<32xf32> {
 func.func @tensor_cast_to_memref(%arg0 : tensor<4x6x16x32xi8>) ->
   memref<?x?x16x32xi8> {
   %0 = tensor.cast %arg0 : tensor<4x6x16x32xi8> to tensor<?x?x16x32xi8>
-  %1 = bufferization.to_memref %0 : memref<?x?x16x32xi8>
+  %1 = bufferization.to_memref %0 : tensor<?x?x16x32xi8> to memref<?x?x16x32xi8>
   return %1 : memref<?x?x16x32xi8>
 }
-// CHECK:   %[[M:.+]] = bufferization.to_memref %[[ARG0]] : memref<4x6x16x32xi8>
+// CHECK:   %[[M:.+]] = bufferization.to_memref %[[ARG0]] : tensor<4x6x16x32xi8>
 // CHECK:   %[[M1:.+]] = memref.cast %[[M]]
 // CHECK-SAME: memref<4x6x16x32xi8> to memref<?x?x16x32xi8>
 // CHECK:   return %[[M1]] : memref<?x?x16x32xi8>
@@ -266,7 +266,7 @@ func.func @tensor_cast_to_memref(%arg0 : tensor<4x6x16x32xi8>) ->
 // CHECK-LABEL: func @load_from_buffer_cast(
 func.func @load_from_buffer_cast(%arg0: index, %arg1: index,
                             %arg2: tensor<?x?xf32>) -> f32 {
-  %0 = bufferization.to_memref %arg2 : memref<?x?xf32>
+  %0 = bufferization.to_memref %arg2 : tensor<?x?xf32> to memref<?x?xf32>
   %1 = memref.load %0[%arg0, %arg1] : memref<?x?xf32>
   return %1 : f32
 }
diff --git a/mlir/test/Dialect/Bufferization/ops.mlir b/mlir/test/Dialect/Bufferization/ops.mlir
index ad4a66c1b79782..7b6a6f492d0698 100644
--- a/mlir/test/Dialect/Bufferization/ops.mlir
+++ b/mlir/test/Dialect/Bufferization/ops.mlir
@@ -15,15 +15,15 @@ func.func @test_clone(%buf : memref<*xf32>) -> memref<*xf32> {
 func.func @test_to_memref(%arg0: tensor<?xi64>, %arg1: tensor<*xi64>)
     -> (memref<?xi64, affine_map<(d0) -> (d0 + 7)>>, memref<*xi64, 1>) {
   %0 = bufferization.to_memref %arg0
-    : memref<?xi64, affine_map<(d0) -> (d0 + 7)>>
+    : tensor<?xi64> to memref<?xi64, affine_map<(d0) -> (d0 + 7)>>
   %1 = bufferization.to_memref %arg1
-    : memref<*xi64, 1>
+    : tensor<*xi64> to memref<*xi64, 1>
   return %0, %1 : memref<?xi64, affine_map<(d0) -> (d0 + 7)>>, memref<*xi64, 1>
 }
 
 // CHECK-LABEL: func @test_to_tensor
 func.func @test_to_tensor(%buf : memref<2xf32>) -> tensor<2xf32> {
-  %tensor = bufferization.to_tensor %buf restrict writable : memref<2xf32>
+  %tensor = bufferization.to_tensor %buf restrict writable : memref<2xf32> to tensor<2xf32>
   return %tensor : tensor<2xf32>
 }
 
diff --git a/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir b/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir
index b82ebdde63a1c3..f5c9f81a189973 100644
--- a/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/ControlFlow/one-shot-bufferize.mlir
@@ -3,7 +3,7 @@
 
 // CHECK-NO-FUNC-LABEL: func @br(
 //  CHECK-NO-FUNC-SAME:     %[[t:.*]]: tensor<5xf32>)
-//       CHECK-NO-FUNC:   %[[m:.*]] = bufferization.to_memref %[[t]] : memref<5xf32, strided<[?], offset: ?>>
+//       CHECK-NO-FUNC:   %[[m:.*]] = bufferization.to_memref %[[t]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>>
 //       CHECK-NO-FUNC:   %[[r:.*]] = scf.execute_region -> memref<5xf32, strided<[?], offset: ?>> {
 //       CHECK-NO-FUNC:     cf.br ^[[block:.*]](%[[m]]
 //       CHECK-NO-FUNC:   ^[[block]](%[[arg1:.*]]: memref<5xf32, strided<[?], offset: ?>>):
@@ -23,7 +23,7 @@ func.func @br(%t: tensor<5xf32>) {
 
 // CHECK-NO-FUNC-LABEL: func @cond_br(
 //  CHECK-NO-FUNC-SAME:     %[[t1:.*]]: tensor<5xf32>,
-//       CHECK-NO-FUNC:   %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<5xf32, strided<[?], offset: ?>>
+//       CHECK-NO-FUNC:   %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<5xf32> to memref<5xf32, strided<[?], offset: ?>>
 //       CHECK-NO-FUNC:   %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32>
 //       CHECK-NO-FUNC:   %[[r:.*]] = scf.execute_region -> memref<5xf32, strided<[?], offset: ?>> {
 //       CHECK-NO-FUNC:     cf.cond_br %{{.*}}, ^[[block1:.*]](%[[m1]] : {{.*}}), ^[[block2:.*]](%[[alloc]] : {{.*}})
diff --git a/mlir/test/Dialect/Linalg/bufferize.mlir b/mlir/test/Dialect/Linalg/bufferize.mlir
index f416cd9fcf0b29..530badebd5c70c 100644
--- a/mlir/test/Dialect/Linalg/bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/bufferize.mlir
@@ -12,7 +12,7 @@
 // CHECK: #map = affine_map<(d0) -> (d0)>
 // CHECK-LABEL:   func @basic(
 // CHECK-SAME:                %[[TENSOR:.*]]: tensor<4xf32>) -> tensor<4xf32> {
-// CHECK-DAG:       %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<4xf32>
+// CHECK-DAG:       %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<4xf32> to memref<4xf32>
 // CHECK-DAG:       %[[RESULT_MEMREF:.*]] = memref.alloc() {{.*}} : memref<4xf32>
 // CHECK:           linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel"]}
 // CHECK-SAME:      ins(%[[MEMREF]] : memref<4xf32>)
@@ -46,7 +46,7 @@ func.func @basic(%arg0: tensor<4xf32>) -> tensor<4xf32> {
 // CHECK: #map = affine_map<(d0) -> (d0)>
 // CHECK-LABEL: func @empty_tensor(
 // CHECK-SAME:      %[[IN:.*]]: tensor<?xf32>, %[[SIZE:.*]]: index)
-// CHECK-DAG:     %[[MEMREF:.*]] = bufferization.to_memref %[[IN]] : memref<?xf32>
+// CHECK-DAG:     %[[MEMREF:.*]] = bufferization.to_memref %[[IN]] : tensor<?xf32> to memref<?xf32>
 // CHECK-DAG:     %[[OUT_BUF:.*]] = memref.alloc(%[[SIZE]]) {{.*}} : memref<?xf32>
 // CHECK:         linalg.generic
 // CHECK-SAME:    ins(%[[MEMREF]] : memref<?xf32>)
@@ -105,7 +105,7 @@ func.func @multiple_results(%arg0: tensor<4xf32>) -> (tensor<4xf32>, tensor<4xf3
 // CHECK-DAG:       %[[DIM1:.*]] = tensor.dim %[[ARG]], %[[C1]] : tensor<?x?xf32>
 // CHECK-DAG:       %[[RESULT0:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
 // CHECK-DAG:       %[[RESULT1:.*]] = memref.alloc(%[[DIM0]], %[[DIM1]]) {{.*}} : memref<?x?xf32>
-// CHECK-DAG:       %[[MEMREF_ARG:.*]] = bufferization.to_memref %[[ARG]] : memref<?x?xf32>
+// CHECK-DAG:       %[[MEMREF_ARG:.*]] = bufferization.to_memref %[[ARG]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK:           linalg.generic
 // CHECK-SAME:      ins(%[[MEMREF_ARG]] : memref<?x?xf32>)
 // CHECK-SAME:      outs(%[[RESULT0]], %[[RESULT1]] : memref<?x?xf32>, memref<?x?xf32>)
@@ -141,8 +141,8 @@ func.func @dynamic_results(%arg0: tensor<?x?xf32>)
 // CHECK-SAME:                                   %[[ARG0_TENSOR:.*]]: tensor<2x3x4xvector<3x4xi4>>,
 // CHECK-SAME:                                   %[[ARG1_TENSOR:.*]]: tensor<3x2xf32>) -> tensor<3x2xf32> {
 // CHECK-DAG:       %[[INIT_BUFFER:.*]] = memref.alloc() {{.*}} : memref<3x2xf32>
-// CHECK-DAG:       %[[ARG0_MEMREF:.*]] = bufferization.to_memref %[[ARG0_TENSOR]] : memref<2x3x4xvector<3x4xi4>>
-// CHECK-DAG:       %[[ARG1_MEMREF:.*]] = bufferization.to_memref %[[ARG1_TENSOR]] : memref<3x2xf32>
+// CHECK-DAG:       %[[ARG0_MEMREF:.*]] = bufferization.to_memref %[[ARG0_TENSOR]] : tensor<2x3x4xvector<3x4xi4>>
+// CHECK-DAG:       %[[ARG1_MEMREF:.*]] = bufferization.to_memref %[[ARG1_TENSOR]] : tensor<3x2xf32>
 // CHECK:           memref.copy %[[ARG1_MEMREF]], %[[INIT_BUFFER]] : memref<3x2xf32> to memref<3x2xf32>
 // CHECK:           linalg.generic
 // CHECK-SAME:      ins(%[[ARG0_MEMREF]] : memref<2x3x4xvector<3x4xi4>>)
diff --git a/mlir/test/Dialect/Linalg/canonicalize.mlir b/mlir/test/Dialect/Linalg/canonicalize.mlir
index d8633f7bc59271..cd439cd23ecd0c 100644
--- a/mlir/test/Dialect/Linalg/canonicalize.mlir
+++ b/mlir/test/Dialect/Linalg/canonicalize.mlir
@@ -373,7 +373,7 @@ func.func @fill_pack_general() -> tensor<1x1x8x4x4x8xi32>{
   %9 = tensor.empty() : tensor<1x1x16x64xi32>
   %extracted_slice_15 = tensor.extract_slice %9[0, 0, 0, 0] [1, 1, 16, 64] [1, 1, 1, 1] : tensor<1x1x16x64xi32> to tensor<1x1x16x64xi32>
   %16 = linalg.fill ins(%c0_i32 : i32) outs(%extracted_slice_15 : tensor<1x1x16x64xi32>) -> tensor<1x1x16x64xi32>
-  %0 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x4x8xi32>
+  %0 = bufferization.to_tensor %alloc restrict writable : memref<1x1x8x4x4x8xi32> to tensor<1x1x8x4x4x8xi32>
   %pack_18 = tensor.pack %16 outer_dims_perm = [0, 1, 3, 2] inner_dims_pos = [2, 3] inner_tiles = [4, 8] into %0 : tensor<1x1x16x64xi32> -> tensor<1x1x8x4x4x8xi32>
   return %pack_18 : tensor<1x1x8x4x4x8xi32>
 }
@@ -921,7 +921,7 @@ func.func @erase_non_identity_noop(%arg0 : tensor<?x?xf32>, %arg1: tensor<?x?xf3
   ^bb0(%in: f32, %out: f32):
     linalg.yield %in: f32
   } -> tensor<?x?xf32>
-  return %0 : tensor<?x?xf32> 
+  return %0 : tensor<?x?xf32>
 }
 
 // Do not erase ops with buffer semantics.
@@ -1073,8 +1073,8 @@ func.func @transpose_identity_perm(%input: tensor<16x32x64xf32>,
 
 // -----
 
-func.func @transpose_transpose_cancel(%input: tensor<5x4x3xf32>, 
-                                      %init1: tensor<4x3x5xf32>, 
+func.func @transpose_transpose_cancel(%input: tensor<5x4x3xf32>,
+                                      %init1: tensor<4x3x5xf32>,
                                       %init2: tensor<5x4x3xf32>) -> tensor<5x4x3xf32> {
   // CHECK-LABEL: @transpose_transpose_cancel
   //  CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<5x4x3xf32>
diff --git a/mlir/test/Dialect/MemRef/normalize-memrefs.mlir b/mlir/test/Dialect/MemRef/normalize-memrefs.mlir
index 11114bcf2b1ab1..6d20ccbf2ca055 100644
--- a/mlir/test/Dialect/MemRef/normalize-memrefs.mlir
+++ b/mlir/test/Dialect/MemRef/normalize-memrefs.mlir
@@ -360,11 +360,11 @@ func.func @neg_map() -> memref<2x3xf32, #neg> {
 // CHECK-LABEL: func @memref_with_strided_offset
 func.func @memref_with_strided_offset(%arg0: tensor<128x512xf32>, %arg1: index, %arg2: index) -> tensor<16x512xf32> {
   %c0 = arith.constant 0 : index
-  %0 = bufferization.to_memref %arg0 : memref<128x512xf32, strided<[?, ?], offset: ?>>
+  %0 = bufferization.to_memref %arg0 : tensor<128x512xf32> to memref<128x512xf32, strided<[?, ?], offset: ?>>
   %subview = memref.subview %0[%arg2, 0] [%arg1, 512] [1, 1] : memref<128x512xf32, strided<[?, ?], offset: ?>> to memref<?x512xf32, strided<[?, ?], offset: ?>>
   // CHECK: %{{.*}} = memref.cast %{{.*}} : memref<?x512xf32, strided<[?, ?], offset: ?>> to memref<16x512xf32, strided<[?, ?], offset: ?>>
   %cast = memref.cast %subview : memref<?x512xf32, strided<[?, ?], offset: ?>> to memref<16x512xf32, strided<[?, ?], offset: ?>>
-  %1 = bufferization.to_tensor %cast : memref<16x512xf32, strided<[?, ?], offset: ?>>
+  %1 = bufferization.to_tensor %cast : memref<16x512xf32, strided<[?, ?], offset: ?>> to tensor<16x512xf32>
   return %1 : tensor<16x512xf32>
 }
 
diff --git a/mlir/test/Dialect/SCF/bufferize.mlir b/mlir/test/Dialect/SCF/bufferize.mlir
index 53fcee692226cb..6c08d9f68e8a9f 100644
--- a/mlir/test/Dialect/SCF/bufferize.mlir
+++ b/mlir/test/Dialect/SCF/bufferize.mlir
@@ -4,8 +4,8 @@
 // CHECK-SAME:             %[[PRED:.*]]: i1,
 // CHECK-SAME:             %[[TRUE_TENSOR:.*]]: tensor<?xf32>,
 // CHECK-SAME:             %[[FALSE_TENSOR:.*]]: tensor<?xf32>) -> tensor<?xf32> {
-// CHECK-DAG:       %[[TRUE_MEMREF:.*]] = bufferization.to_memref %[[TRUE_TENSOR]] : memref<?xf32>
-// CHECK-DAG:       %[[FALSE_MEMREF:.*]] = bufferization.to_memref %[[FALSE_TENSOR]] : memref<?xf32>
+// CHECK-DAG:       %[[TRUE_MEMREF:.*]] = bufferization.to_memref %[[TRUE_TENSOR]] : tensor<?xf32> to memref<?xf32>
+// CHECK-DAG:       %[[FALSE_MEMREF:.*]] = bufferization.to_memref %[[FALSE_TENSOR]] : tensor<?xf32> to memref<?xf32>
 // CHECK:           %[[RESULT_MEMREF:.*]] = scf.if %[[PRED]] -> (memref<?xf32>) {
 // CHECK:             scf.yield %[[TRUE_MEMREF]] : memref<?xf32>
 // CHECK:           } else {
@@ -29,7 +29,7 @@ func.func @if(%pred: i1, %true_val: tensor<?xf32>, %false_val: tensor<?xf32>) ->
 // CHECK-SAME:              %[[TENSOR:.*]]: tensor<f32>,
 // CHECK-SAME:              %[[LB:.*]]: index, %[[UB:.*]]: index,
 // CHECK-SAME:              %[[STEP:.*]]: index) -> tensor<f32> {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<f32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<f32> to memref<f32>
 // Note: scf.for iter_args always bufferize to a memory write. This could be
 // optimized by analyzing the loop body.
 // CHECK:           %[[MEMREF_COPY:.*]] = memref.alloc()
@@ -70,7 +70,7 @@ func.func @if_correct_recursive_legalization_behavior(%pred: i1, %tensor: tensor
 // CHECK-LABEL:   func @for_correct_recursive_legalization_behavior(
 // CHECK-SAME:                                                      %[[TENSOR:.*]]: tensor<f32>,
 // CHECK-SAME:                                                      %[[INDEX:.*]]: index) -> tensor<f32> {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<f32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<f32> to memref<f32>
 // Note: scf.for iter_args always bufferize to a memory write. This could be
 // optimized by analyzing the loop body.
 // CHECK:           %[[MEMREF_COPY:.*]] = memref.alloc()
@@ -78,7 +78,7 @@ func.func @if_correct_recursive_legalization_behavior(%pred: i1, %tensor: tensor
 // CHECK:           %[[RESULT:.*]] = scf.for %{{.*}} = %[[INDEX]] to %[[INDEX]] step %[[INDEX]] iter_args(%[[MEMREF_ITER:.*]] = %[[MEMREF_COPY]]) -> (memref<f32>) {
 // CHECK:             %[[TENSOR_ITER:.*]] = bufferization.to_tensor %[[MEMREF_ITER]] : memref<f32>
 // CHECK:             %[[TENSOR_MUNGED:.*]] = "test.munge_tensor"(%[[TENSOR_ITER]]) : (tensor<f32>) -> tensor<f32>
-// CHECK:             %[[MEMREF_MUNGED:.*]] = bufferization.to_memref %[[TENSOR_MUNGED]] : memref<f32>
+// CHECK:             %[[MEMREF_MUNGED:.*]] = bufferization.to_memref %[[TENSOR_MUNGED]] : tensor<f32> to memref<f32>
 // CHECK:             scf.yield %[[MEMREF_MUNGED]] : memref<f32>
 // CHECK:           }
 // CHECK:           %[[TENSOR:.*]] = bufferization.to_tensor %[[RESULT]] : memref<f32>
@@ -96,7 +96,7 @@ func.func @for_correct_recursive_legalization_behavior(%arg0: tensor<f32>, %inde
 
 // CHECK-LABEL:   func @bufferize_while(
 // CHECK-SAME: %[[ARG0:.*]]: i64, %[[ARG1:.*]]: i64, %[[ARG2:.*]]: tensor<f32>
-// CHECK: %[[M:.*]] = bufferization.to_memref %[[ARG2]] : memref<f32>
+// CHECK: %[[M:.*]] = bufferization.to_memref %[[ARG2]] : tensor<f32> to memref<f32>
 // Note: scf.while iter_args always bufferize to a memory write. This could be
 // optimized by analyzing the loop body.
 // CHECK:           %[[MEMREF_COPY:.*]] = memref.alloc()
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir
new file mode 100644
index 00000000000000..709943e5965858
--- /dev/null
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-encodings.mlir
@@ -0,0 +1,73 @@
+// RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space allow-return-allocs-from-loops allow-unknown-ops" -allow-unregistered-dialect -split-input-file | FileCheck %s
+
+// Here and below, unknown op 'some.use' will force 'bufferization.to_tensor' operations to remain in the body,
+// allowing us to check that the encoding on the '%iter' tensor is correctly preserved.
+
+func.func @scf_for_iter_arg(%arg0: tensor<128xf32, 1>, %arg1: index, %arg2: index, %arg3: index) -> tensor<128xf32, 1> {
+  %0 = scf.for %i = %arg1 to %arg2 step %arg3 iter_args(%iter = %arg0) -> tensor<128xf32, 1> {
+    %0 = "some.use"(%iter) : (tensor<128xf32, 1>) -> tensor<128xf32, 1>
+    scf.yield %0 : tensor<128xf32, 1>
+  }
+  return %0 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_for_iter_arg
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>, %[[arg1:.+]]: index, %[[arg2:.+]]: index, %[[arg3:.+]]: index)
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {alignment = 64 : i64} : memref<128xf32, 1>
+//       CHECK:     memref.copy %[[v0]], %[[alloc]] : memref<128xf32, strided<[?], offset: ?>, 1> to memref<128xf32, 1>
+//       CHECK:     %[[cast:.+]] = memref.cast %[[alloc]] : memref<128xf32, 1> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = scf.for %{{.+}} = %[[arg1]] to %[[arg2]] step %[[arg3]] iter_args(%[[arg6:.+]] = %[[cast]]) -> (memref<128xf32, strided<[?], offset: ?>, 1>)
+//  CHECK-NEXT:       %[[v3:.+]] = bufferization.to_tensor %[[arg6]] : memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64>
+//  CHECK-NEXT:       %[[v4:.+]] = "some.use"(%[[v3]]) : (tensor<128xf32, 1 : i64>) -> tensor<128xf32, 1 : i64>
+//  CHECK-NEXT:       %[[v5:.+]] = bufferization.to_memref %[[v4]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//  CHECK-NEXT:       scf.yield %[[v5]] : memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v2:.+]] = bufferization.to_tensor %[[v1]] : memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v2]] : tensor<128xf32, 1 : i64>
+
+// -----
+
+func.func @scf_forall(
+    %idx: index,
+    %idx2: index,
+    %arg1: tensor<?xf32, 1>,
+    %arg2: tensor<?xf32, 1>) -> (tensor<?xf32, 1>) {
+  %cst = arith.constant 4.200000e+01 : f32
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %2 = scf.forall (%arg3) in (%idx2) shared_outs(%o = %arg2) -> (tensor<?xf32, 1>) {
+      %8 = "some.use"(%o) : (tensor<?xf32, 1>) -> tensor<?xf32, 1>
+      scf.forall.in_parallel {
+        tensor.parallel_insert_slice %8 into %o[5] [%idx] [%c1] :
+          tensor<?xf32, 1> into tensor<?xf32, 1>
+      }
+  }
+  return %2 : tensor<?xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_forall
+//       CHECK:     scf.forall
+//       CHECK:       %[[v2:.+]] = bufferization.to_tensor %{{.+}} : memref<?xf32, 1> to tensor<?xf32, 1 : i64>
+//       CHECK:       %[[v3:.+]] = "some.use"(%[[v2]]) : (tensor<?xf32, 1 : i64>) -> tensor<?xf32, 1 : i64>
+//       CHECK:       bufferization.to_memref %[[v3]] : tensor<?xf32, 1 : i64> to memref<?xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = bufferization.to_tensor %{{.+}} : memref<?xf32, 1> to tensor<?xf32, 1 : i64>
+//       CHECK:     return %[[v1]] : tensor<?xf32, 1 : i64>
+
+// -----
+
+func.func @scf_execute_region(%arg0: tensor<128xf32, 1>) -> tensor<128xf32, 1> {
+  %0 = scf.execute_region -> tensor<128xf32, 1> {
+    scf.yield %arg0 : tensor<128xf32, 1>
+  }
+  %1 = "some.use"(%0) : (tensor<128xf32, 1>) -> tensor<128xf32, 1>
+  return %1 : tensor<128xf32, 1>
+}
+
+// CHECK-LABEL: func.func @scf_execute_region
+//  CHECK-SAME: (%[[arg0:.+]]: tensor<128xf32, 1 : i64>)
+//       CHECK:     %[[v0:.+]] = bufferization.to_memref %[[arg0]] : tensor<128xf32, 1 : i64> to memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v1:.+]] = scf.execute_region -> memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:       scf.yield %[[v0]] : memref<128xf32, strided<[?], offset: ?>, 1>
+//       CHECK:     %[[v2:.+]] = bufferization.to_tensor %[[v1]] : memref<128xf32, strided<[?], offset: ?>, 1> to tensor<128xf32, 1 : i64>
+//       CHECK:     %[[v3:.+]] = "some.use"(%[[v2]]) : (tensor<128xf32, 1 : i64>) -> tensor<128xf32, 1 : i64>
+//       CHECK:     return %[[v3]] : tensor<128xf32, 1 : i64>
diff --git a/mlir/test/Dialect/Shape/bufferize.mlir b/mlir/test/Dialect/Shape/bufferize.mlir
index 9f30a052208f0b..02e147d917d0f9 100644
--- a/mlir/test/Dialect/Shape/bufferize.mlir
+++ b/mlir/test/Dialect/Shape/bufferize.mlir
@@ -6,7 +6,7 @@
 // CHECK:           %[[WTRUE:.*]] = shape.const_witness true
 // CHECK:           %[[MEMREF:.*]] = shape.assuming %[[WTRUE]] -> (memref<2xf16>) {
 // CHECK:             %[[TENSOR_VAL:.*]] = "test.source"() : () -> tensor<2xf16>
-// CHECK:             %[[YIELDED_MEMREF:.*]] = bufferization.to_memref %[[TENSOR_VAL]] : memref<2xf16>
+// CHECK:             %[[YIELDED_MEMREF:.*]] = bufferization.to_memref %[[TENSOR_VAL]] : tensor<2xf16> to memref<2xf16>
 // CHECK:             shape.assuming_yield %[[YIELDED_MEMREF]] : memref<2xf16>
 // CHECK:           }
 // CHECK:           %[[TENSOR:.*]] = bufferization.to_tensor %[[MEMREF:.*]] : memref<2xf16>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
index 8293169049ca61..6d98667e775634 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul24_lib.mlir
@@ -14,19 +14,19 @@
 // CHECK-SAME:      %[[VAL_2:.*2]]: tensor<?x?xf16>) -> tensor<?x?xf16> {
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?xf16>
+// CHECK:           %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<?x?xf16> to memref<?x?xf16>
 // CHECK:           %[[VAL_6:.*]] = gpu.wait async
 // CHECK:           %[[VAL_7:.*]] = memref.dim %[[VAL_5]], %[[VAL_3]] : memref<?x?xf16>
 // CHECK:           %[[VAL_8:.*]] = memref.dim %[[VAL_5]], %[[VAL_4]] : memref<?x?xf16>
 // CHECK:           %[[VAL_9:.*]], %[[VAL_10:.*]] = gpu.alloc async {{\[}}%[[VAL_6]]] (%[[VAL_7]], %[[VAL_8]]) : memref<?x?xf16>
 // CHECK:           %[[VAL_11:.*]] = gpu.memcpy async {{\[}}%[[VAL_10]]] %[[VAL_9]], %[[VAL_5]] : memref<?x?xf16>, memref<?x?xf16>
-// CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf16>
+// CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf16> to memref<?x?xf16>
 // CHECK:           %[[VAL_13:.*]] = gpu.wait async
 // CHECK:           %[[VAL_14:.*]] = memref.dim %[[VAL_12]], %[[VAL_3]] : memref<?x?xf16>
 // CHECK:           %[[VAL_15:.*]] = memref.dim %[[VAL_12]], %[[VAL_4]] : memref<?x?xf16>
 // CHECK:           %[[VAL_16:.*]], %[[VAL_17:.*]] = gpu.alloc async {{\[}}%[[VAL_13]]] (%[[VAL_14]], %[[VAL_15]]) : memref<?x?xf16>
 // CHECK:           %[[VAL_18:.*]] = gpu.memcpy async {{\[}}%[[VAL_17]]] %[[VAL_16]], %[[VAL_12]] : memref<?x?xf16>, memref<?x?xf16>
-// CHECK:           %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf16>
+// CHECK:           %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf16> to memref<?x?xf16>
 // CHECK:           %[[VAL_20:.*]] = gpu.wait async
 // CHECK:           %[[VAL_21:.*]] = memref.dim %[[VAL_19]], %[[VAL_3]] : memref<?x?xf16>
 // CHECK:           %[[VAL_22:.*]] = memref.dim %[[VAL_19]], %[[VAL_4]] : memref<?x?xf16>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
index 699508acf13cf7..63c308a3d5e6f0 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -30,13 +30,13 @@
 // CHECK:           %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
 // CHECK:           %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
 // CHECK:           %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref<?xf64>, memref<?xf64>
-// CHECK:           %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
+// CHECK:           %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf64> to memref<?x?xf64>
 // CHECK:           %[[VAL_28:.*]] = gpu.wait async
 // CHECK:           %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref<?x?xf64>
 // CHECK:           %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref<?x?xf64>
 // CHECK:           %[[VAL_31:.*]], %[[VAL_32:.*]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref<?x?xf64>
 // CHECK:           %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref<?x?xf64>, memref<?x?xf64>
-// CHECK:           %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf64>
+// CHECK:           %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf64> to memref<?x?xf64>
 // CHECK:           %[[VAL_35:.*]] = gpu.wait async
 // CHECK:           %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref<?x?xf64>
 // CHECK:           %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref<?x?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
index 14a0e9c30b2016..088e468cee7954 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -30,12 +30,12 @@ module {
 // CHECK:           %[[VAL_22:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xf64>
 // CHECK:           %[[VAL_23:.*]], %[[VAL_24:.*]] = gpu.alloc async {{\[}}%[[VAL_21]]] (%[[VAL_22]]) : memref<?xf64>
 // CHECK:           %[[VAL_25:.*]] = gpu.memcpy async {{\[}}%[[VAL_24]]] %[[VAL_23]], %[[VAL_10]] : memref<?xf64>, memref<?xf64>
-// CHECK:           %[[VAL_26:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?xf64>
+// CHECK:           %[[VAL_26:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?xf64> to memref<?xf64>
 // CHECK:           %[[VAL_27:.*]] = gpu.wait async
 // CHECK:           %[[VAL_28:.*]] = memref.dim %[[VAL_26]], %[[VAL_3]] : memref<?xf64>
 // CHECK:           %[[VAL_29:.*]], %[[VAL_30:.*]] = gpu.alloc async {{\[}}%[[VAL_27]]] (%[[VAL_28]]) : memref<?xf64>
 // CHECK:           %[[VAL_31:.*]] = gpu.memcpy async {{\[}}%[[VAL_30]]] %[[VAL_29]], %[[VAL_26]] : memref<?xf64>, memref<?xf64>
-// CHECK:           %[[VAL_32:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?xf64>
+// CHECK:           %[[VAL_32:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?xf64> to memref<?xf64>
 // CHECK:           %[[VAL_33:.*]] = gpu.wait async
 // CHECK:           %[[VAL_34:.*]] = memref.dim %[[VAL_32]], %[[VAL_3]] : memref<?xf64>
 // CHECK:           %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.alloc async {{\[}}%[[VAL_33]]] (%[[VAL_34]]) : memref<?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
index 97f36d49927bf9..1058bc03fe9cb9 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -28,11 +28,11 @@
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 8 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<8x8xf64, #sparse{{[0-9]*}}>
-// CHECK:           %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK:           %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK:           %[[VAL_7:.*]] = gpu.wait async
 // CHECK:           %[[VAL_8:.*]], %[[VAL_9:.*]] = gpu.alloc async {{\[}}%[[VAL_7]]] () : memref<8x8xf64>
 // CHECK:           %[[VAL_10:.*]] = gpu.memcpy async {{\[}}%[[VAL_9]]] %[[VAL_8]], %[[VAL_6]] : memref<8x8xf64>, memref<8x8xf64>
-// CHECK:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK:           %[[VAL_12:.*]] = gpu.wait async
 // CHECK:           %[[VAL_13:.*]], %[[VAL_14:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] () : memref<8x8xf64>
 // CHECK:           %[[VAL_15:.*]] = gpu.memcpy async {{\[}}%[[VAL_14]]] %[[VAL_13]], %[[VAL_11]] : memref<8x8xf64>, memref<8x8xf64>
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
index 93f49002a47d2d..32741086b9e6eb 100644
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sddmm_lib.mlir
@@ -30,13 +30,13 @@
 // CHECK:           %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?xf32>
 // CHECK:           %[[VAL_9:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf32>
 // CHECK:           %[[VAL_10:.*]] = tensor.dim %[[VAL_2]], %[[VAL_4]] : tensor<?x?xf32>
-// CHECK:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
+// CHECK:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK:           %[[VAL_12:.*]] = gpu.wait async
 // CHECK:           %[[VAL_13:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?x?xf32>
 // CHECK:           %[[VAL_14:.*]] = memref.dim %[[VAL_11]], %[[VAL_4]] : memref<?x?xf32>
 // CHECK:           %[[VAL_15:.*]], %[[VAL_16:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]], %[[VAL_14]]) : memref<?x?xf32>
 // CHECK:           %[[VAL_17:.*]] = gpu.memcpy async {{\[}}%[[VAL_16]]] %[[VAL_15]], %[[VAL_11]] : memref<?x?xf32>, memref<?x?xf32>
-// CHECK:           %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
+// CHECK:           %[[VAL_18:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK:           %[[VAL_19:.*]] = gpu.wait async
 // CHECK:           %[[VAL_20:.*]] = memref.dim %[[VAL_18]], %[[VAL_3]] : memref<?x?xf32>
 // CHECK:           %[[VAL_21:.*]] = memref.dim %[[VAL_18]], %[[VAL_4]] : memref<?x?xf32>
diff --git a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir
index f9559ce648c783..857967bcf521ab 100644
--- a/mlir/test/Dialect/SparseTensor/constant_index_map.mlir
+++ b/mlir/test/Dialect/SparseTensor/constant_index_map.mlir
@@ -14,8 +14,8 @@
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = tensor.empty() : tensor<77xi1, #{{.*}}>
-// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<1x77xi1>
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<1x77xi1>
+// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<1x77xi1>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<1x77xi1>
 // CHECK:           %[[VAL_8:.*]] = scf.for %[[VAL_9:.*]] = %[[VAL_3]] to %[[VAL_2]] step %[[VAL_4]] iter_args(%[[VAL_10:.*]] = %[[VAL_5]]) -> (tensor<77xi1, #{{.*}}>) {
 // CHECK:             %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]], %[[VAL_9]]] : memref<1x77xi1>
 // CHECK:             %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_3]], %[[VAL_9]]] : memref<1x77xi1>
diff --git a/mlir/test/Dialect/SparseTensor/dense.mlir b/mlir/test/Dialect/SparseTensor/dense.mlir
index 60a217e05e61ec..5ed1558a53163c 100644
--- a/mlir/test/Dialect/SparseTensor/dense.mlir
+++ b/mlir/test/Dialect/SparseTensor/dense.mlir
@@ -40,7 +40,7 @@
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
+// CHECK:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK:           scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index
 // CHECK:             scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] {
@@ -79,7 +79,7 @@ func.func @dense1(%arga: tensor<32x16xf32, #DenseMatrix>,
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 16 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32x16xf32>
+// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK:           %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK:           scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index
@@ -122,7 +122,7 @@ func.func @dense2(%arga: tensor<32x16xf32>,
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 16 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32x16x8xf32>
+// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
 // CHECK:           %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK:           scf.for %[[VAL_9:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_11:.*]] = arith.muli %[[VAL_9]], %[[VAL_4]] : index
diff --git a/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir
index 0ef143a1a2f38c..275f7f2ff25f7e 100644
--- a/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir
+++ b/mlir/test/Dialect/SparseTensor/fuse_sparse_pad_with_consumer.mlir
@@ -30,7 +30,7 @@
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<4x4xf32, #sparse> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<4x4xf32, #sparse> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_10]] : memref<8x8xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_10]] :
 // CHECK-DAG:       linalg.fill ins(%[[VAL_8]] : f32) outs(%[[VAL_14]] : memref<8x8xf32>)
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_5]] {
 // CHECK:             %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_7]] : index
@@ -54,7 +54,7 @@
 // CHECK:               memref.store %[[VAL_30]], %[[VAL_14]]{{\[}}%[[VAL_15]], %[[VAL_27]]] : memref<8x8xf32>
 // CHECK:             } {"Emitted from" = "linalg.generic"}
 // CHECK:           } {"Emitted from" = "linalg.generic"}
-// CHECK:           %[[VAL_31:.*]] = bufferization.to_tensor %[[VAL_14]] : memref<8x8xf32>
+// CHECK:           %[[VAL_31:.*]] = bufferization.to_tensor %[[VAL_14]] :
 // CHECK:           return %[[VAL_31]] : tensor<8x8xf32>
 // CHECK:         }
 func.func @padded_mul(%arg0: tensor<4x4xf32, #CSR>, %arg1: tensor<8x8xf32>) -> tensor<8x8xf32> {
diff --git a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir
index 2b9a2dd8f4883d..58f182dbdc44d1 100644
--- a/mlir/test/Dialect/SparseTensor/sorted_coo.mlir
+++ b/mlir/test/Dialect/SparseTensor/sorted_coo.mlir
@@ -101,7 +101,7 @@ func.func @sparse_scale(%argx: tensor<?x?xf32, #SortedCOO>) -> tensor<?x?xf32, #
 // C_HECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xindex, strided<[?], offset: ?>>
 // C_HECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xindex, strided<[?], offset: ?>>
 // C_HECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// C_HECK:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// C_HECK:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // C_HECK:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // C_HECK:           %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // C_HECK:           %[[VAL_13:.*]] = scf.while (%[[VAL_14:.*]] = %[[VAL_11]]) : (index) -> index {
@@ -170,7 +170,7 @@ func.func @matvec(%arga: tensor<32x64xf64, #SortedCOO>,
 // C_HECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xindex, strided<[?], offset: ?>>
 // C_HECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xindex, strided<[?], offset: ?>>
 // C_HECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x64xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// C_HECK:           %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x64xf64>
+// C_HECK:           %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x64xf64> to memref<32x64xf64>
 // C_HECK:           linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_15]] : memref<32x64xf64>)
 // C_HECK:           %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // C_HECK:           %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
index fcc221660353a1..003dcc6708d634 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_1d.mlir
@@ -51,7 +51,7 @@ func.func @add_d(%arga: tensor<32xf32, #DV>, %argb: f32, %argx: tensor<32xf32>)
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK:           %[[VAL_INITTENSOR:.*]] = tensor.empty() : tensor<32xf32>
 // CHECK:           %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_INITTENSOR]] : memref<32xf32>
+// CHECK:           %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_INITTENSOR]] : tensor<32xf32> to memref<32xf32>
 // CHECK:           linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_7]] : memref<32xf32>)
 // CHECK:           scf.for %[[VAL_8:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_5]] {
 // CHECK:             %[[VAL_9:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_8]]] : memref<?xf32>
@@ -247,7 +247,7 @@ func.func @mul_s(%arga: tensor<32xf32, #SV>, %argb: f32, %argx: tensor<32xf32>)
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]]
 // CHECK:           linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
 // CHECK:           scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
@@ -278,7 +278,7 @@ func.func @add_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tens
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]]
 // CHECK:           linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_9]] : memref<32xf32>)
 // CHECK:           scf.for %[[VAL_10:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
@@ -309,7 +309,7 @@ func.func @mul_dd(%arga: tensor<32xf32, #DV>, %argb: tensor<32xf32>, %argx: tens
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant true
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
@@ -366,7 +366,7 @@ func.func @add_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens
 // CHECK-SAME:      %[[VAL_2:.*]]: tensor<32xf32>) -> tensor<32xf32> {
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_5:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
@@ -406,7 +406,7 @@ func.func @mul_ds(%arga: tensor<32xf32>, %argb: tensor<32xf32, #SV>, %argx: tens
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]]
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32xf32>)
 // CHECK-DAG:       %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -463,7 +463,7 @@ func.func @add_sd(%arga: tensor<32xf32, #SV>, %argb: tensor<32xf32>, %argx: tens
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32>
 // CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]]
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32xf32>)
 // CHECK-DAG:       %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -830,7 +830,7 @@ func.func @two_way_inv_alt(%arga: tensor<16xf32, #SV>,
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
 // CHECK-DAG:       %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = memref.load %[[VAL_6]][] : memref<f32>
@@ -875,7 +875,7 @@ func.func @sum_reduction(%arga: tensor<?xf32, #SV>, %argx: tensor<f32>) -> tenso
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<f32> to memref<f32>
 // CHECK-DAG:           %[[VAL_13:.*]] = memref.load %[[VAL_11]][] : memref<f32>
 // CHECK-DAG:           %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK-DAG:           %[[VAL_15:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -977,11 +977,11 @@ func.func @sum_reduction_ss(%arga: tensor<16xf32, #SV>,
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 0 : index} : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<f32> to memref<f32>
 // CHECK-DAG:       %[[VAL_15:.*]] = memref.load %[[VAL_13]][] : memref<f32>
 // CHECK-DAG:       %[[VAL_16:.*]] = memref.load %[[VAL_9]][] : memref<f32>
 // CHECK-DAG:       %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -1089,11 +1089,11 @@ func.func @sum_reduction_inv(%arga: tensor<16xf32, #SV>,
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant true
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?xf64>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<?xf64> to memref<?xf64>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?xf64>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?xf64> to memref<?xf64>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_3]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_3]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_3]] : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xf64>
@@ -1272,7 +1272,7 @@ func.func @four_tensors_op(%arga: tensor<?xf64>,
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 0 : index} : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : memref<f64>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<f64> to memref<f64>
 // CHECK-DAG:       %[[VAL_17:.*]] = memref.load %[[VAL_15]][] : memref<f64>
 // CHECK-DAG:       %[[VAL_18:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-DAG:       %[[VAL_19:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
index 06670ab096fcdf..9c34e54db6c853 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_2d.mlir
@@ -25,8 +25,8 @@
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK:           linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32x16xf32>)
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index
@@ -62,8 +62,8 @@ func.func @add_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1>
 // CHECK:           linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_10]] : memref<32x16xi1>)
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index
@@ -98,8 +98,8 @@ func.func @cmp_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK:           linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_10]] : memref<32x16xf32>)
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_4]] : index
@@ -137,8 +137,8 @@ func.func @mul_dd(%arga: tensor<32x16xf32, #Tdd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>)
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xindex>
@@ -202,8 +202,8 @@ func.func @add_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<?xindex>
@@ -265,8 +265,8 @@ func.func @cmp_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32x16xf32>)
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK:             %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
@@ -306,8 +306,8 @@ func.func @mul_ds(%arga: tensor<32x16xf32, #Tds>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref<?xindex>
@@ -376,9 +376,9 @@ func.func @add_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
-// CHECK-DAG:       linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
+// CHECK-DAG:           %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:           %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1>
+// CHECK-DAG:           linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_14]] : memref<32x16xi1>)
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
 // CHECK:           %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_15]], %[[VAL_19:.*]] = %[[VAL_6]]) : (index, index) -> (index, index) {
@@ -444,9 +444,9 @@ func.func @cmp_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
-// CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32x16xf32>)
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:           linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_11]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_12]] to %[[VAL_13]] step %[[VAL_5]] {
@@ -488,8 +488,8 @@ func.func @mul_sd(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_15]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref<?xindex>
@@ -584,8 +584,8 @@ func.func @add_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_5]] : i1) outs(%[[VAL_16]] : memref<32x16xi1>)
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_7]]] : memref<?xindex>
@@ -679,8 +679,8 @@ func.func @cmp_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16xf32> to memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_12]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -726,7 +726,7 @@ func.func @mul_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32>, %arg
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_16]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -891,7 +891,7 @@ func.func @add_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:           %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xi1>
+// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xi1> to memref<32x16xi1>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_3]] : i1) outs(%[[VAL_17]] : memref<32x16xi1>)
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_19:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -1166,7 +1166,7 @@ func.func @sub_ss_batched(%0: tensor<2x3xf64, #BatchedVector>, %1: tensor<2x3xf6
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_16]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -1260,7 +1260,7 @@ func.func @mul_ss_ss(%arga: tensor<32x16xf32, #Tss>, %argb: tensor<32x16xf32, #T
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_15]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_7]]] : memref<?xindex>
@@ -1362,7 +1362,7 @@ func.func @add_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #T
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f32) outs(%[[VAL_13]] : memref<32x16xf32>)
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -1415,8 +1415,8 @@ func.func @mul_sd_ds(%arga: tensor<32x16xf32, #Tsd>, %argb: tensor<32x16xf32, #T
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<16x32xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<16xf32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf32> to memref<32xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<16xf32> to memref<16xf32>
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-DAG:         %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
 // CHECK-DAG:         %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
@@ -1464,7 +1464,7 @@ func.func @matvec(%argA: tensor<16x32xf32, #Tds>, %argb: tensor<32xf32>, %argx:
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK:           %[[VAL_10:.*]] = scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_2]] step %[[VAL_3]] iter_args(%[[VAL_12:.*]] = %[[VAL_9]]) -> (f32) {
 // CHECK:             %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_11]]] : memref<?xindex>
@@ -1511,7 +1511,7 @@ func.func @sum_reduction(%arga: tensor<10x20xf32, #Tds>, %argx: tensor<f32>) ->
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse{{[0-9]*}}> to memref<?xf64>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.lvl %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf64> to memref<?x?xf64>
 // CHECK-DAG:       linalg.fill ins(%{{.*}} : f64) outs(%[[VAL_11]] : memref<?x?xf64>)
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_3]] to %[[VAL_8]] step %[[VAL_4]] {
 // CHECK:             %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_12]]] : memref<?xindex>
@@ -1563,9 +1563,9 @@ func.func @scale(%arga: tensor<?x?xf64, #Tds>, %argx: tensor<?x?xf64>) -> tensor
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK-DAG:       %[[VAL_11:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf32> to memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf32> to memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_4]] {
@@ -1638,10 +1638,10 @@ func.func @sampled_dense_dense(%args: tensor<?x?xf32, #Tss>,
 // CHECK-DAG:       %[[VAL_17:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_18:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_19:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?xf32>
-// CHECK-DAG:       %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : memref<f32>
+// CHECK-DAG:       %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<?xf32> to memref<?xf32>
+// CHECK-DAG:       %[[VAL_21:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<f32> to memref<f32>
 // CHECK-DAG:       %[[VAL_22:.*]] = sparse_tensor.lvl %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32,
-// CHECK-DAG:       %[[VAL_24:.*]] = bufferization.to_memref %[[VAL_5]] : memref<?xf32>
+// CHECK-DAG:       %[[VAL_24:.*]] = bufferization.to_memref %[[VAL_5]] : tensor<?xf32> to memref<?xf32>
 // CHECK:           %[[VAL_25:.*]] = memref.load %[[VAL_21]][] : memref<f32>
 // CHECK:           %[[VAL_26:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir
index 427a5c3d03a730..9158ac427763b4 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_3d.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_3d.mlir
@@ -33,9 +33,9 @@
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_4]] : index
 // CHECK:             scf.for %[[VAL_13:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
@@ -75,9 +75,9 @@ func.func @add_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_11]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_14:.*]] = arith.muli %[[VAL_12]], %[[VAL_4]] : index
 // CHECK:             scf.for %[[VAL_13:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
@@ -120,8 +120,8 @@ func.func @mul_ddd(%arga: tensor<32x16x8xf32, #Tddd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
 // CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_15]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_16:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_9]] {
 // CHECK:             %[[VAL_18:.*]] = arith.muli %[[VAL_16]], %[[VAL_5]] : index
@@ -187,9 +187,9 @@ func.func @add_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
 // CHECK:             %[[VAL_16:.*]] = arith.muli %[[VAL_14]], %[[VAL_5]] : index
 // CHECK:             scf.for %[[VAL_15:.*]] = %[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] {
@@ -234,9 +234,9 @@ func.func @mul_dds(%arga: tensor<32x16x8xf32, #Tdds>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_7]] to %[[VAL_3]] step %[[VAL_8]] {
 // CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
 // CHECK:             %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_8]] : index
@@ -305,9 +305,9 @@ func.func @add_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_13]]] : memref<?xindex>
 // CHECK:             %[[VAL_15:.*]] = arith.addi %[[VAL_13]], %[[VAL_6]] : index
@@ -354,9 +354,9 @@ func.func @mul_dsd(%arga: tensor<32x16x8xf32, #Tdsd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_18:.*]] = %[[VAL_8]] to %[[VAL_4]] step %[[VAL_9]] {
 // CHECK:             %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_18]]] : memref<?xindex>
 // CHECK:             %[[VAL_20:.*]] = arith.addi %[[VAL_18]], %[[VAL_9]] : index
@@ -450,9 +450,9 @@ func.func @add_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
 // CHECK:             %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_6]] : index
@@ -499,9 +499,9 @@ func.func @mul_dss(%arga: tensor<32x16x8xf32, #Tdss>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref<?xindex>
 // CHECK:           %[[VAL_17:.*]]:2 = scf.while (%[[VAL_18:.*]] = %[[VAL_15]], %[[VAL_19:.*]] = %[[VAL_7]]) : (index, index) -> (index, index) {
@@ -575,9 +575,9 @@ func.func @add_sdd(%arga: tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_12]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_6]] {
@@ -625,9 +625,9 @@ func.func @mul_sdd(%arga: tensor<32x16x8xf32, #Tsdd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_17]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref<?xindex>
 // CHECK:           %[[VAL_19:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref<?xindex>
 // CHECK:           %[[VAL_20:.*]]:2 = scf.while (%[[VAL_21:.*]] = %[[VAL_18]], %[[VAL_22:.*]] = %[[VAL_8]]) : (index, index) -> (index, index) {
@@ -726,9 +726,9 @@ func.func @add_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_14]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_6]] {
@@ -778,9 +778,9 @@ func.func @mul_sds(%arga: tensor<32x16x8xf32, #Tsds>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_16]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_16]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_8]]] : memref<?xindex>
 // CHECK:           %[[VAL_19:.*]]:2 = scf.while (%[[VAL_20:.*]] = %[[VAL_17]], %[[VAL_21:.*]] = %[[VAL_7]]) : (index, index) -> (index, index) {
@@ -883,9 +883,9 @@ func.func @add_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_16:.*]] = %[[VAL_14]] to %[[VAL_15]] step %[[VAL_5]] {
@@ -937,9 +937,9 @@ func.func @mul_ssd(%arga: tensor<32x16x8xf32, #Tssd>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_16:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_19]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_19:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_19]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_20:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_8]]] : memref<?xindex>
 // CHECK:           %[[VAL_21:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_9]]] : memref<?xindex>
 // CHECK:           %[[VAL_22:.*]]:2 = scf.while (%[[VAL_23:.*]] = %[[VAL_20]], %[[VAL_24:.*]] = %[[VAL_8]]) : (index, index) -> (index, index) {
@@ -1067,9 +1067,9 @@ func.func @add_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16x8xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32x16x8xf32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16x8xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_15]] : memref<32x16x8xf32>)
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16x8xf32> to memref<32x16x8xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_15]] : memref<32x16x8xf32>)
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_18:.*]] = %[[VAL_16]] to %[[VAL_17]] step %[[VAL_5]] {
@@ -1127,11 +1127,11 @@ func.func @mul_sss(%arga: tensor<32x16x8xf32, #Tsss>, %argb: tensor<32x16x8xf32>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 2 : index} : tensor<?x?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.lvl %[[VAL_1]], %[[VAL_6]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf32> to memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.lvl %[[VAL_1]], %[[VAL_5]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_14:.*]] = tensor.dim %[[VAL_2]], %[[VAL_6]] : tensor<?x?xf32>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK:           scf.for %[[VAL_17:.*]] = %[[VAL_5]] to %[[VAL_13]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_19:.*]] = arith.muli %[[VAL_17]], %[[VAL_10]] : index
 // CHECK:             scf.for %[[VAL_18:.*]] = %[[VAL_5]] to %[[VAL_10]] step %[[VAL_6]] {
@@ -1191,7 +1191,7 @@ func.func @kernel_3d(%arga: tensor<?x?xf32>,
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20x30xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<10x20x30xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20x30xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
 // CHECK:           %[[VAL_11:.*]] = memref.load %[[VAL_10]][] : memref<f32>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -1246,10 +1246,10 @@ func.func @sum_reduction(%arga: tensor<10x20x30xf32, #Tsss>, %argx: tensor<f32>)
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?x?xf32>
 // CHECK-DAG:       %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?x?xf32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<?x?x?xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<?x?x?xf32> to memref<?x?x?xf32>
 // CHECK-DAG:       %[[VAL_9:.*]] = tensor.dim %[[VAL_0]], %[[VAL_5]] : tensor<?x?x?xf32>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<f32> to memref<f32>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_12]][] : memref<f32>
 // CHECK:           %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_9]] step %[[VAL_3]] iter_args(%[[VAL_16:.*]] = %[[VAL_13]]) -> (f32) {
 // CHECK:             %[[VAL_17:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<?xf32>
@@ -1305,10 +1305,10 @@ func.func @sum_reduction_inv(%arga: tensor<?x?x?xf32>,
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_8:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<30xf32>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : memref<10x20x30xf32>
-// CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<10x20x30xf32>)
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20xf32> to memref<20xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<30xf32> to memref<30xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_3]] : tensor<10x20x30xf32> to memref<10x20x30xf32>
+// CHECK-DAG:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_13]] : memref<10x20x30xf32>)
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_7]] to %[[VAL_4]] step %[[VAL_8]] {
 // CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_14]]] : memref<?xf32>
 // CHECK:             scf.for %[[VAL_16:.*]] = %[[VAL_7]] to %[[VAL_5]] step %[[VAL_8]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
index 1ec6fb586d434e..e2dbadc4db5bfe 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_affine.mlir
@@ -25,8 +25,8 @@
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<4xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<4xf32> to memref<4xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf32> to memref<32xf32>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_4]]] : memref<4xf32>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -112,8 +112,8 @@ func.func @mul_inv_enc_dense1d(%arga: tensor<32xf32, #EncDenseVec>,
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34xi32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<34xi32> to memref<34xi32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi32> to memref<32xi32>
 // CHECK-DAG:       linalg.fill ins(%[[ZERO]] : i32) outs(%[[VAL_11]] : memref<32xi32>)
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -163,8 +163,8 @@ func.func @and_affine_dense1d(%arga: tensor<32xi32, #SpVec>,
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<34x19xf64>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<34x19xf64> to memref<34x19xf64>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64>
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_5]] to %[[VAL_4]] step %[[VAL_3]] {
 // CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_14]]] : memref<?xindex>
 // CHECK:             %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_3]] : index
@@ -223,7 +223,7 @@ func.func @mul_affine_dense2d(%arga: tensor<32x16xf64, #CSR>,
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf64>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_5]] {
@@ -287,7 +287,7 @@ func.func @mul_affine_dense_dim_2d(%arga: tensor<34x16xf64, #CSR>,
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<32x19xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32x16xf64>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32x16xf64> to memref<32x16xf64>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_16:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_17:.*]] = %[[VAL_15]] to %[[VAL_16]] step %[[VAL_6]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_batch.mlir b/mlir/test/Dialect/SparseTensor/sparse_batch.mlir
index f158fc6108a130..cfddef743cf28e 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_batch.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_batch.mlir
@@ -14,7 +14,7 @@
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 2 : index} : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 2 : index} : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x4x2xf32, #sparse{{[0-9]*}}> to memref<8x?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_6]] : memref<8x4x2xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_6]] : tensor<8x4x2xf32>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_10]] : memref<8x4x2xf32>)
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_2]] to %[[VAL_5]] step %[[VAL_1]] {
 // CHECK:             scf.for %[[VAL_12:.*]] = %[[VAL_2]] to %[[VAL_4]] step %[[VAL_1]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir
index b6c7b771394b15..d1d8276f8daefd 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fp_ops.mlir
@@ -38,7 +38,7 @@
 // CHECK-DAG:     %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:         %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] {
@@ -70,7 +70,7 @@ func.func @abs(%arga: tensor<32xf64, #SV>,
 // CHECK-DAG:     %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:         %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] {
@@ -99,10 +99,10 @@ func.func @ceil(%arga: tensor<32xf64, #SV>,
 // CHECK-SAME:    %[[VAL_1:.*]]: tensor<32xf64>) -> tensor<32xf64> {
 // CHECK-DAG:     %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
+// CHECK-DAG:         %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
+// CHECK-DAG:         %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
+// CHECK-DAG:         %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
+// CHECK-DAG:         %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:         %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] {
@@ -131,10 +131,10 @@ func.func @floor(%arga: tensor<32xf64, #SV>,
 // CHECK-SAME:    %[[VAL_1:.*]]: tensor<32xf64>) -> tensor<32xf64> {
 // CHECK-DAG:     %[[VAL_2:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[VAL_3:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
+// CHECK-DAG:         %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
+// CHECK-DAG:         %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
+// CHECK-DAG:         %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
+// CHECK-DAG:         %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:         %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] {
@@ -169,8 +169,8 @@ func.func @neg(%arga: tensor<32xf64, #SV>,
 // CHECK-DAG:     %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:     %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
-// CHECK-DAG:     %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
+// CHECK-DAG:     %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:         %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:         %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -229,8 +229,8 @@ func.func @add(%arga: tensor<32xf64, #SV>,
 // CHECK-DAG:     %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:     %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
-// CHECK-DAG:     %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
+// CHECK-DAG:     %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:         %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:         %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -289,8 +289,8 @@ func.func @sub(%arga: tensor<32xf64, #SV>,
 // CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:     %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
-// CHECK-DAG:     %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
+// CHECK-DAG:     %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] {
@@ -325,7 +325,7 @@ func.func @mul(%arga: tensor<32xf64, #SV>,
 // CHECK-DAG:     %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xf64, #sparse{{[0-9]*}}>
 // CHECK-DAG:     %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:     %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xf64>
+// CHECK-DAG:     %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xf64> to memref<32xf64>
 // CHECK:         %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:         %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:         scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
index 50f21416f5a74b..d9f48afef48100 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_fusion.mlir
@@ -25,10 +25,10 @@
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<100xf64, #sparse> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<100xf64, #sparse> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_8]] : memref<100xf64>
-// CHECK-DAG:           linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_12]] : memref<100xf64>)
-// CHECK-DAG:           %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
-// CHECK-DAG:           %[[VAL_14:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref<?xindex>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_8]] :
+// CHECK-DAG:        linalg.fill ins(%[[VAL_4]] : f64) outs(%[[VAL_12]] : memref<100xf64>)
+// CHECK-DAG:        %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_3]]] : memref<?xindex>
+// CHECK-DAG:        %[[VAL_14:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:           %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_3]]) : (index, index) -> (index, index) {
 // CHECK:             %[[VAL_18:.*]] = arith.cmpi ult, %[[VAL_16]], %[[VAL_14]] : index
 // CHECK:             scf.condition(%[[VAL_18]]) %[[VAL_16]], %[[VAL_17]] : index, index
@@ -57,7 +57,7 @@
 // CHECK:           scf.for %[[VAL_31:.*]] = %[[VAL_32:.*]]#1 to %[[VAL_5]] step %[[VAL_2]] {
 // CHECK:             memref.store %[[VAL_7]], %[[VAL_12]]{{\[}}%[[VAL_31]]] : memref<100xf64>
 // CHECK:           }
-// CHECK:           %[[VAL_33:.*]] = bufferization.to_tensor %[[VAL_12]] : memref<100xf64>
+// CHECK:           %[[VAL_33:.*]] = bufferization.to_tensor %[[VAL_12]] :
 // CHECK:           return %[[VAL_33]] : tensor<100xf64>
 // CHECK:         }
 func.func @sparse_fusion(%argA: tensor<100xf64, #SV>) -> tensor<100xf64> {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir b/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir
index 8fa473b5a9dba5..3a33a200f8279d 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_int_ops.mlir
@@ -33,8 +33,8 @@
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}>
-// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -94,8 +94,8 @@ func.func @add(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}>
-// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_15:.*]]:2 = scf.while (%[[VAL_16:.*]] = %[[VAL_13]], %[[VAL_17:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -154,8 +154,8 @@ func.func @sub(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] {
@@ -190,7 +190,7 @@ func.func @mul(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
@@ -224,7 +224,7 @@ func.func @divsbyc(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
@@ -258,8 +258,8 @@ func.func @divubyc(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_11:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_12:.*]] = %[[VAL_10]] to %[[VAL_11]] step %[[VAL_4]] {
@@ -296,8 +296,8 @@ func.func @and(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -356,8 +356,8 @@ func.func @or(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_9:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
-// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
+// CHECK-DAG:           %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]]:2 = scf.while (%[[VAL_15:.*]] = %[[VAL_12]], %[[VAL_16:.*]] = %[[VAL_4]]) : (index, index) -> (index, index) {
@@ -414,7 +414,7 @@ func.func @xor(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
@@ -448,7 +448,7 @@ func.func @ashrbyc(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
@@ -482,7 +482,7 @@ func.func @lsrbyc(%arga: tensor<32xi64, #SV>,
 // CHECK-DAG:           %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:           %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<32xi64>
+// CHECK-DAG:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<32xi64> to memref<32xi64>
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_9]] to %[[VAL_10]] step %[[VAL_4]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
index 78e29979ca1acb..d215ebb1c0c6f1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels.mlir
@@ -18,8 +18,8 @@
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10x20xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20x30xf32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x30xf32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20x30xf32> to memref<20x30xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_15:.*]] = %[[VAL_13]] to %[[VAL_14]] step %[[VAL_5]] {
@@ -58,13 +58,13 @@ func.func @matmul1(%a: tensor<10x20xf32, #DCSR>,
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 10 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<10x20xf32>
+// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<10x20xf32> to memref<10x20xf32>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index}
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index}
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index}
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index}
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]]
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x30xf32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x30xf32> to memref<10x30xf32>
 // CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK:             %[[VAL_14:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -203,13 +203,13 @@ func.func @matmul2(%A: tensor<4x8xf64, #DCSR>,
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 6 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<8x8xi32>
+// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<8x8xi32> to memref<8x8xi32>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x3xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x6xi32>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<6x6xi32> to memref<6x6xi32>
 // CHECK:           scf.for %[[VAL_13:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK:             scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK:               %[[VAL_15:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_13]], %[[VAL_14]]] : memref<6x6xi32>
@@ -255,13 +255,13 @@ func.func @conv2d(%input:  tensor<8x8xi32>,
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 2 : i64
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : memref<5x3xi8>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<5x3xi8> to memref<5x3xi8>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 1 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 1 : index} : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<3x6xi8, #sparse{{[0-9]*}}> to memref<?xi8>
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<5x6xi64>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<5x6xi64> to memref<5x6xi64>
 // CHECK:           scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK:             %[[VAL_15:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:             %[[VAL_16:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -309,7 +309,7 @@ func.func @quantized_matmul(%input1: tensor<5x3xi8>,
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<1024xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 0 : index} : tensor<1024xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<1024xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<f32>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<f32> to memref<f32>
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_11]][] : memref<f32>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           %[[VAL_14:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
index f819458e038582..836e26b51f7c18 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_kernels_to_iterator.mlir
@@ -85,7 +85,7 @@ func.func @sqsum(%arg0: tensor<?x?x?x?xi32, #COO>) -> tensor<i32> {
 // CHECK:           %[[VAL_3:.*]] = arith.constant 0 : index
 // CHECK:           %[[VAL_4:.*]] = arith.constant 0 : i32
 // CHECK:           %[[VAL_5:.*]] = arith.constant dense<0> : tensor<10xi32>
-// CHECK:           %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_5]] : memref<10xi32>
+// CHECK:           %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_5]] : tensor<10xi32> to memref<10xi32>
 // CHECK:           linalg.fill ins(%[[VAL_4]] : i32) outs(%[[VAL_6]] : memref<10xi32>)
 // CHECK:           %[[VAL_7:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref<?xindex>
 // CHECK:           %[[VAL_8:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #sparse{{.*}}> to memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
index c27df00785522a..cab57389f032e1 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower.mlir
@@ -29,8 +29,8 @@
 // CHECK-HIR-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}>
-// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-HIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-HIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-HIR:           scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-HIR-DAG:         %[[VAL_13:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
 // CHECK-HIR-DAG:         %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_5]] : index
@@ -60,8 +60,8 @@
 // CHECK-MIR-DAG:       %[[VAL_6:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_7:.*]] = call @sparseCoordinates0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref<?xf64>
-// CHECK-MIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-MIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-MIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-MIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-MIR:           scf.for %[[VAL_14:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-MIR-DAG:         %[[VAL_15:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_14]]] : memref<?xindex>
 // CHECK-MIR-DAG:         %[[VAL_16:.*]] = arith.addi %[[VAL_14]], %[[VAL_5]] : index
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
index 9fbb9dd0a26d17..b998eeb0d39443 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower_col.mlir
@@ -32,8 +32,8 @@
 // CHECK-HIR-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[DEMAP]] {level = 1 : index}
 // CHECK-HIR-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[DEMAP]] {level = 1 : index}
 // CHECK-HIR-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[DEMAP]]
-// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-HIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-HIR-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-HIR:           scf.for %[[VAL_12:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-HIR-DAG:         %[[VAL_13:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_12]]] : memref<64xf64>
 // CHECK-HIR-DAG:         %[[VAL_14:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_12]]] : memref<?xindex>
@@ -62,8 +62,8 @@
 // CHECK-MIR-DAG:       %[[VAL_7:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_8:.*]] = call @sparseCoordinates0(%[[VAL_0]], %[[VAL_6]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_9:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref<?xf64>
-// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-MIR-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-MIR-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-MIR:           scf.for %[[VAL_15:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK-MIR:             %[[VAL_16:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_15]]] : memref<64xf64>
 // CHECK-MIR:             %[[VAL_17:.*]] = memref.load %[[VAL_7]]{{\[}}%[[VAL_15]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
index 773c5677eea550..e1e1953779fa88 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_lower_inplace.mlir
@@ -29,8 +29,8 @@
 // CHECK-HIR-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_7:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x64xf64, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x64xf64, #sparse{{[0-9]*}}>
-// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-HIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-HIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-HIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-HIR:           scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-HIR-DAG:         %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref<?xindex>
 // CHECK-HIR-DAG:         %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_5]] : index
@@ -60,8 +60,8 @@
 // CHECK-MIR-DAG:       %[[VAL_6:.*]] = call @sparsePositions0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_7:.*]] = call @sparseCoordinates0(%[[VAL_0]], %[[VAL_5]]) : (!llvm.ptr, index) -> memref<?xindex>
 // CHECK-MIR-DAG:       %[[VAL_8:.*]] = call @sparseValuesF64(%[[VAL_0]]) : (!llvm.ptr) -> memref<?xf64>
-// CHECK-MIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<64xf64>
-// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<32xf64>
+// CHECK-MIR-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<64xf64> to memref<64xf64>
+// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<32xf64> to memref<32xf64>
 // CHECK-MIR:           scf.for %[[VAL_11:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] {
 // CHECK-MIR-DAG:         %[[VAL_12:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_11]]] : memref<?xindex>
 // CHECK-MIR-DAG:         %[[VAL_13:.*]] = arith.addi %[[VAL_11]], %[[VAL_5]] : index
diff --git a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir
index 2ac36fa6d8996b..b80a48363773f7 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_nd.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_nd.mlir
@@ -35,13 +35,13 @@
 // CHECK-DAG:       %[[VAL_10:.*]] = arith.constant 80 : index
 // CHECK-DAG:       %[[VAL_11:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_12:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_0]] : memref<10x20x30x40x50x60x70x80xf32>
+// CHECK-DAG:       %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 3 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 3 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_16:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 4 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_17:.*]] = sparse_tensor.coordinates %[[VAL_1]] {level = 4 : index} : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_18:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<80x70x60x50x40x30x20x10xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_2]] : memref<10x20x30x40x50x60x70x80xf32>
+// CHECK-DAG:       %[[VAL_20:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<10x20x30x40x50x60x70x80xf32> to memref<10x20x30x40x50x60x70x80xf32>
 // CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_20]] : memref<10x20x30x40x50x60x70x80xf32>
 // CHECK:           scf.for %[[VAL_21:.*]] = %[[VAL_11]] to %[[VAL_10]] step %[[VAL_12]] {
 // CHECK:             %[[VAL_23:.*]] = arith.muli %[[VAL_21]], %[[VAL_9]] : index
diff --git a/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir b/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir
index 5b453e9a736a24..ab7a30e2f96a59 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_outbuf.mlir
@@ -19,7 +19,7 @@
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #{{.*}}> to memref<?xi32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : memref<10xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<10xf32> to memref<10xf32>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_3]] : f32) outs(%[[VAL_8]] : memref<10xf32>)
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_4]]] : memref<?xindex>
@@ -53,7 +53,7 @@ func.func @allout_inplace(%arga: tensor<10xi32, #SV>,
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xi32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xi32, #{{.*}}> to memref<?xi32>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_4]] : memref<10xf32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<10xf32> to memref<10xf32>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_2]] : f32) outs(%[[VAL_8]] : memref<10xf32>)
 // CHECK:           %[[VAL_9:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_1]]] : memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.load %[[VAL_5]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -86,7 +86,7 @@ func.func @allout_materialize(%arga: tensor<10xi32, #SV>) -> tensor<10xf32> {
 // CHECK-DAG:       %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<10xf32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<10xf32, #{{.*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<10xf32, #{{.*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<10xf32>
+// CHECK-DAG:       %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<10xf32> to memref<10xf32>
 // CHECK-DAG:       %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_10:.*]] = %[[VAL_8]] to %[[VAL_9]] step %[[VAL_3]] {
diff --git a/mlir/test/Dialect/SparseTensor/sparse_pack.mlir b/mlir/test/Dialect/SparseTensor/sparse_pack.mlir
index a90194a74ee4a3..91e3842bdd367f 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_pack.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_pack.mlir
@@ -12,12 +12,12 @@
 // CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 2 : index
 // CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 100 : index
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : memref<2xindex>
+// CHECK-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<2xindex> to memref<2xindex>
 // CHECK-DAG:       %[[VAL_7:.*]] = memref.cast %[[VAL_6]] : memref<2xindex> to memref<?xindex>
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : memref<6x2xi32>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<6x2xi32> to memref<6x2xi32>
 // CHECK-DAG:       %[[VAL_9:.*]] = memref.collapse_shape %[[VAL_8]] {{\[\[}}0, 1]] : memref<6x2xi32> into memref<12xi32>
 // CHECK-DAG:       %[[VAL_10:.*]] = memref.cast %[[VAL_9]] : memref<12xi32> to memref<?xi32>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_0]] : memref<6xf64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<6xf64> to memref<6xf64>
 // CHECK-DAG:       %[[VAL_12:.*]] = memref.cast %[[VAL_11]] : memref<6xf64> to memref<?xf64>
 // CHECK:           %[[VAL_13:.*]] = sparse_tensor.storage_specifier.init
 // CHECK:           %[[VAL_14:.*]] = sparse_tensor.storage_specifier.set %[[VAL_13]]  lvl_sz at 0 with %[[VAL_4]]
@@ -45,18 +45,18 @@ func.func @sparse_pack(%values: tensor<6xf64>, %pos:tensor<2xindex>, %coordinate
 // CHECK-SAME:      %[[VAL_5:.*]]: tensor<2xindex>,
 // CHECK-SAME:      %[[VAL_6:.*]]: tensor<6x2xi32>) -> (tensor<6xf64>, tensor<2xindex>, tensor<6x2xi32>) {
 // CHECK:           %[[VAL_7:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  pos_mem_sz at 0
-// CHECK:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_5]] : memref<2xindex>
+// CHECK:           %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_5]] : tensor<2xindex> to memref<2xindex>
 // CHECK:           %[[VAL_9:.*]] = memref.subview %[[VAL_8]][0] {{\[}}%[[VAL_7]]] [1] : memref<2xindex> to memref<?xindex>
 // CHECK:           %[[VAL_10:.*]] = memref.subview %[[VAL_0]][0] {{\[}}%[[VAL_7]]] [1] : memref<?xindex> to memref<?xindex>
 // CHECK:           memref.copy %[[VAL_10]], %[[VAL_9]] : memref<?xindex> to memref<?xindex>
 // CHECK:           %[[VAL_11:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  crd_mem_sz at 0
-// CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_6]] : memref<6x2xi32>
+// CHECK:           %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_6]] : tensor<6x2xi32> to memref<6x2xi32>
 // CHECK:           %[[VAL_13:.*]] = memref.collapse_shape %[[VAL_12]] {{\[\[}}0, 1]] : memref<6x2xi32> into memref<12xi32>
 // CHECK:           %[[VAL_14:.*]] = memref.subview %[[VAL_13]][0] {{\[}}%[[VAL_11]]] [1] : memref<12xi32> to memref<?xi32>
 // CHECK:           %[[VAL_15:.*]] = memref.subview %[[VAL_1]][0] {{\[}}%[[VAL_11]]] [1] : memref<?xi32> to memref<?xi32>
 // CHECK:           memref.copy %[[VAL_15]], %[[VAL_14]] : memref<?xi32> to memref<?xi32>
 // CHECK:           %[[VAL_16:.*]] = sparse_tensor.storage_specifier.get %[[VAL_3]]  val_mem_sz
-// CHECK:           %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_4]] : memref<6xf64>
+// CHECK:           %[[VAL_17:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<6xf64> to memref<6xf64>
 // CHECK:           %[[VAL_18:.*]] = memref.subview %[[VAL_17]][0] {{\[}}%[[VAL_16]]] [1] : memref<6xf64> to memref<?xf64>
 // CHECK:           %[[VAL_19:.*]] = memref.subview %[[VAL_2]][0] {{\[}}%[[VAL_16]]] [1] : memref<?xf64> to memref<?xf64>
 // CHECK:           memref.copy %[[VAL_19]], %[[VAL_18]] : memref<?xf64> to memref<?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
index 44a551464c8609..c2cabd4351112b 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_parallel_reduce.mlir
@@ -24,8 +24,8 @@
 //   CHECK-DAG:  %[[TMP_0:.*]] = sparse_tensor.positions %[[TMP_arg0]] {level = 1 : index}
 //   CHECK-DAG:  %[[TMP_1:.*]] = sparse_tensor.coordinates %[[TMP_arg0]] {level = 1 : index}
 //   CHECK-DAG:  %[[TMP_2:.*]] = sparse_tensor.values %[[TMP_arg0]]
-//   CHECK-DAG:  %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : memref<32xf32>
-//   CHECK-DAG:  %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : memref<16xf32>
+//   CHECK-DAG:  %[[TMP_3:.*]] = bufferization.to_memref %[[TMP_arg1]] : tensor<32xf32> to memref<32xf32>
+//   CHECK-DAG:  %[[TMP_4:.*]] = bufferization.to_memref %[[TMP_arg2]] : tensor<16xf32> to memref<16xf32>
 //       CHECK:  scf.parallel (%[[TMP_arg3:.*]]) = (%[[TMP_c0]]) to (%[[TMP_c16]]) step (%[[TMP_c1]]) {
 //       CHECK:    %[[TMP_6:.*]] = memref.load %[[TMP_4]][%[[TMP_arg3]]] : memref<16xf32>
 //       CHECK:    %[[TMP_7:.*]] = memref.load %[[TMP_0]][%[[TMP_arg3]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir
index 07c273fcddc3bf..5f8002b5b6d31f 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_perm.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_perm.mlir
@@ -24,7 +24,7 @@
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 1 : index
 // CHECK:           %[[DEMAP:.*]] = sparse_tensor.reinterpret_map %[[VAL_0]]
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[DEMAP]] : tensor<30x10x20xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<20x30x10xf32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<20x30x10xf32> to memref<20x30x10xf32>
 // CHECK:           linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_9]] : memref<20x30x10xf32>)
 // CHECK:           scf.for %[[VAL_10:.*]] = %[[VAL_5]] to %[[VAL_3]] step %[[VAL_6]] {
 // CHECK:             %[[VAL_12:.*]] = arith.muli %[[VAL_10]], %[[VAL_4]] : index
@@ -64,7 +64,7 @@ func.func @sparse_static_dims(%arga: tensor<10x20x30xf32, #X>,
 // CHECK-DAG:       %[[VAL_6:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_2]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_3]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_4]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?x?xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?x?xf32> to memref<?x?x?xf32>
 // CHECK-DAG:       linalg.fill ins(%[[ZERO]] : f32) outs(%[[VAL_10]] : memref<?x?x?xf32>)
 // CHECK:           scf.for %[[VAL_11:.*]] = %[[VAL_3]] to %[[VAL_7]] step %[[VAL_4]] {
 // CHECK:             %[[VAL_13:.*]] = arith.muli %[[VAL_11]], %[[VAL_8]] : index
diff --git a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir
index 9bf10345f4ea55..93b5da41fc7f98 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_perm_lower.mlir
@@ -26,7 +26,7 @@
 // CHECK-HIR-DAG:       %[[VAL_6:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_2]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_7:.*]] = sparse_tensor.lvl %[[DEMAP]], %[[VAL_4]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
 // CHECK-HIR-DAG:       %[[VAL_8:.*]] = sparse_tensor.values %[[DEMAP]] : tensor<?x?x?xf32, #sparse{{[0-9]*}}>
-// CHECK-HIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
+// CHECK-HIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
 // CHECK-HIR:           %[[VAL_11:.*]] = tensor.extract %[[VAL_1]][] : tensor<f32>
 // CHECK-HIR:           %[[VAL_12:.*]] = scf.for %[[VAL_13:.*]] = %[[VAL_3]] to %[[VAL_5]] step %[[VAL_2]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) {
 // CHECK-HIR:             %[[VAL_18:.*]] = arith.muli %[[VAL_13]], %[[VAL_6]] : index
@@ -58,7 +58,7 @@
 // CHECK-MIR-DAG:       %[[DimSize1:.*]] = call @sparseLvlSize(%[[ARGA]], %[[I1]])
 // CHECK-MIR-DAG:       %[[DimSize2:.*]] = call @sparseLvlSize(%[[ARGA]], %[[I2]])
 // CHECK-MIR-DAG:       %[[VAL_8:.*]] = call @sparseValuesF32(%[[ARGA]]) : (!llvm.ptr) -> memref<?xf32>
-// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[ARGX]] : memref<f32>
+// CHECK-MIR-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[ARGX]] : tensor<f32> to memref<f32>
 // CHECK-MIR:           %[[VAL_11:.*]] = tensor.extract %[[ARGX]][] : tensor<f32>
 // CHECK-MIR:           %[[VAL_12:.*]] = scf.for %[[D2:.*]] = %[[I0]] to %[[DimSize0]] step %[[I1]] iter_args(%[[VAL_14:.*]] = %[[VAL_11]]) -> (f32) {
 // CHECK-MIR:             %[[VAL_18:.*]] = arith.muli %[[D2]], %[[DimSize1]] : index
diff --git a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir
index 666882edcbab38..e5df646851d43e 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_scalars.mlir
@@ -33,8 +33,8 @@
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<32x16xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : memref<f32>
-// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : memref<32x16xf32>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<f32> to memref<f32>
+// CHECK-DAG:       %[[VAL_15:.*]] = bufferization.to_memref %[[VAL_4]] : tensor<32x16xf32> to memref<32x16xf32>
 // CHECK-DAG:       %[[VAL_16:.*]] = memref.load %[[VAL_14]][] : memref<f32>
 // CHECK-DAG:       %[[VAL_17:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_6]]] : memref<?xindex>
 // CHECK-DAG:       %[[VAL_18:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_7]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir b/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
index a03b97684a7a4a..e769534641ec85 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_sddmm.mlir
@@ -64,14 +64,14 @@ func.func @fold_yield_direct_zero() -> tensor<32xf64> {
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant dense<0.000000e+00> : tensor<8x8xf64>
 // CHECK-DAG:       %[[VAL_7:.*]] = bufferization.alloc_tensor() copy(%[[VAL_6]]) : tensor<8x8xf64>
 // CHECK-DAG:       %[[VAL_8:.*]] = bufferization.alloc_tensor() copy(%[[VAL_6]]) : tensor<8x8xf64>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_8]] : memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_16:.*]] = bufferization.to_memref %[[VAL_8]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK:           %[[VAL_17:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_18:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           scf.for %[[VAL_19:.*]] = %[[VAL_17]] to %[[VAL_18]] step %[[VAL_5]] {
@@ -132,8 +132,8 @@ func.func @sampled_dd_unfused(%args: tensor<8x8xf64, #SM>,
 // CHECK-DAG:       %[[VAL_8:.*]] = arith.constant dense<0.000000e+00> : tensor<8x8xf64>
 // CHECK-DAG:       %[[VAL_9:.*]] = bufferization.alloc_tensor() copy(%[[VAL_8]]) : tensor<8x8xf64>
 // CHECK-DAG:       %[[VAL_10:.*]] = tensor.empty() : tensor<8x8xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
-// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_14:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_15:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir b/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir
index a66028c61b22f4..3cc0aa26c8bc29 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_sddmm_org.mlir
@@ -30,8 +30,8 @@
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant false
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant true
 // CHECK-DAG:       %[[VAL_8:.*]] = tensor.empty() : tensor<8x8xf64, #sparse{{[0-9]*}}>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
index 281e7858ce25ec..c99d5d25f7b4a7 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_chain.mlir
@@ -31,7 +31,7 @@
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<64x32xf64, #sparse{{[0-9]*}}> to memref<?xf64>
-// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f64>
+// CHECK-DAG:       %[[VAL_14:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<f64> to memref<f64>
 // CHECK:           %[[VAL_15:.*]] = memref.load %[[VAL_14]][] : memref<f64>
 // CHECK:           %[[VAL_16:.*]] = scf.for %[[VAL_17:.*]] = %[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] iter_args(%[[VAL_18:.*]] = %[[VAL_15]]) -> (f64) {
 // CHECK:             %[[VAL_19:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_17]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
index ac357d9dc24855..d88372276989df 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_vector_index.mlir
@@ -28,7 +28,7 @@
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : tensor<8xi64> to memref<8xi64>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_4]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_6]]] : memref<?xindex>
@@ -70,7 +70,7 @@ func.func @sparse_index_1d_conj(%arga: tensor<8xi64, #SparseVector>) -> tensor<8
 // CHECK-DAG:       %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8xi64, #sparse{{[0-9]*}}> to memref<?xi64>
-// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : memref<8xi64>
+// CHECK-DAG:       %[[VAL_11:.*]] = bufferization.to_memref %[[VAL_7]] : tensor<8xi64> to memref<8xi64>
 // CHECK-DAG:       linalg.fill ins(%[[VAL_3]] : i64) outs(%[[VAL_11]] : memref<8xi64>)
 // CHECK:           %[[VAL_12:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK:           %[[VAL_13:.*]] = memref.load %[[VAL_8]]{{\[}}%[[VAL_5]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir b/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir
index 287b62ef44c65f..6c3acf43f241e1 100644
--- a/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir
+++ b/mlir/test/Dialect/SparseTensor/spy_sddmm.mlir
@@ -24,8 +24,8 @@
 // CHECK-DAG:     %[[VAL_3:.*]] = arith.constant 8 : index
 // CHECK-DAG:     %[[VAL_4:.*]] = arith.constant 0 : index
 // CHECK-DAG:     %[[VAL_5:.*]] = arith.constant 1 : index
-// CHECK-DAG:     %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<8x8xf64>
-// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK-DAG:     %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<8x8xf64> to memref<8x8xf64>
+// CHECK-DAG:     %[[VAL_7:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<8x8xf64> to memref<8x8xf64>
 // CHECK-DAG:     %[[VAL_8:.*]] = sparse_tensor.positions %[[VAL_2]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_9:.*]] = sparse_tensor.coordinates %[[VAL_2]] {level = 1 : index} : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-DAG:     %[[VAL_10:.*]] = sparse_tensor.values %[[VAL_2]] : tensor<8x8xf64, #sparse{{[0-9]*}}> to memref<?xf64>
diff --git a/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir b/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir
index 10a7ac5802ec98..df1e564c06231a 100755
--- a/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir
+++ b/mlir/test/Dialect/SparseTensor/spy_sddmm_bsr.mlir
@@ -37,8 +37,8 @@
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0.000000e+00 : f32
 // CHECK-DAG:       %[[VAL_7:.*]] = sparse_tensor.reinterpret_map %[[VAL_0]] : tensor<?x?xf32, #[[$BSR]]> to tensor<?x?x2x2xf32, #[[$MAP]]>
 // CHECK-DAG:       %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_3]] : tensor<?x?xf32>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf32>
-// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_1]] : tensor<?x?xf32> to memref<?x?xf32>
+// CHECK-DAG:       %[[VAL_10:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<?x?xf32> to memref<?x?xf32>
 // CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.lvl %[[VAL_7]], %[[VAL_4]] : tensor<?x?x2x2xf32, #[[$MAP]]>
 // CHECK-DAG:       %[[VAL_12:.*]] = sparse_tensor.positions %[[VAL_7]] {level = 1 : index} : tensor<?x?x2x2xf32, #[[$MAP]]> to memref<?xindex>
 // CHECK-DAG:       %[[VAL_13:.*]] = sparse_tensor.coordinates %[[VAL_7]] {level = 1 : index} : tensor<?x?x2x2xf32, #[[$MAP]]> to memref<?xindex>
diff --git a/mlir/test/Dialect/SparseTensor/unused-tensor.mlir b/mlir/test/Dialect/SparseTensor/unused-tensor.mlir
index f85acb9c6969a8..7e8b9f83fac795 100644
--- a/mlir/test/Dialect/SparseTensor/unused-tensor.mlir
+++ b/mlir/test/Dialect/SparseTensor/unused-tensor.mlir
@@ -28,8 +28,8 @@
 // CHECK-DAG:       %[[VAL_5:.*]] = arith.constant 4 : index
 // CHECK-DAG:       %[[VAL_6:.*]] = arith.constant 0 : index
 // CHECK-DAG:       %[[VAL_7:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<2x4xf64>
-// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : memref<2x4xf64>
+// CHECK-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<2x4xf64>
+// CHECK-DAG:       %[[VAL_9:.*]] = bufferization.to_memref %[[VAL_2]] : tensor<2x4xf64>
 // CHECK:           scf.for %[[VAL_10:.*]] = %[[VAL_6]] to %[[VAL_4]] step %[[VAL_7]] {
 // CHECK:             scf.for %[[VAL_11:.*]] = %[[VAL_6]] to %[[VAL_3]] step %[[VAL_7]] {
 // CHECK:               scf.for %[[VAL_12:.*]] = %[[VAL_6]] to %[[VAL_5]] step %[[VAL_7]] {
diff --git a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
index 578e86a793f906..15228c6a5f79ad 100644
--- a/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
+++ b/mlir/test/Dialect/SparseTensor/vectorize_reduction.mlir
@@ -16,7 +16,7 @@
 // CHECK-ON-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xi13>
-// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i13>
+// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i13> to memref<i13>
 // CHECK-ON:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i13>
 // CHECK-ON:           %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -42,7 +42,7 @@
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:       %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:       %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xi13>
-// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i13>
+// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i13> to memref<i13>
 // CHECK-OFF:           %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i13>
 // CHECK-OFF:           %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:           %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -93,7 +93,7 @@ func.func @sparse_reduction_ori(%argx: tensor<i13>,
 // CHECK-ON-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xi13>
-// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i13>
+// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i13> to memref<i13>
 // CHECK-ON:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i13>
 // CHECK-ON:           %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -119,7 +119,7 @@ func.func @sparse_reduction_ori(%argx: tensor<i13>,
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:       %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:       %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi13, #sparse{{[0-9]*}}> to memref<?xi13>
-// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i13>
+// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i13> to memref<i13>
 // CHECK-OFF:           %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i13>
 // CHECK-OFF:           %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:           %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -168,7 +168,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 // CHECK-ON-DAG:       %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:       %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:       %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-ON-DAG:       %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-ON:           %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:           %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_3]]] : memref<?xindex>
 // CHECK-ON:           %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -194,7 +194,7 @@ func.func @sparse_reduction_ori_accumulator_on_rhs(%argx: tensor<i13>,
 // CHECK-OFF-DAG:       %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:       %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:       %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-OFF-DAG:       %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-OFF:           %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i32>
 // CHECK-OFF:           %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:           %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -243,7 +243,7 @@ func.func @sparse_reduction_subi(%argx: tensor<i32>,
 // CHECK-ON-DAG:  %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:  %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:  %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-ON-DAG:  %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-ON-DAG:  %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-ON:  %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:  %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:  %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -269,7 +269,7 @@ func.func @sparse_reduction_subi(%argx: tensor<i32>,
 // CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i32>
 // CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -319,7 +319,7 @@ func.func @sparse_reduction_xor(%argx: tensor<i32>,
 // CHECK-ON-DAG:   %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:   %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:   %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<i32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -345,7 +345,7 @@ func.func @sparse_reduction_xor(%argx: tensor<i32>,
 // CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xi32, #sparse{{[0-9]*}}> to memref<?xi32>
-// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<i32>
+// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<i32> to memref<i32>
 // CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<i32>
 // CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -395,7 +395,7 @@ func.func @sparse_reduction_addi(%argx: tensor<i32>,
 // CHECK-ON-DAG:   %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:   %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:   %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f32>
+// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<f32> to memref<f32>
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -421,7 +421,7 @@ func.func @sparse_reduction_addi(%argx: tensor<i32>,
 // CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f32>
+// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<f32> to memref<f32>
 // CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<f32>
 // CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
@@ -471,7 +471,7 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 // CHECK-ON-DAG:   %[[VAL_5:.*]] = arith.constant 1 : index
 // CHECK-ON-DAG:   %[[VAL_6:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-ON-DAG:   %[[VAL_7:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f32>
+// CHECK-ON-DAG:   %[[VAL_8:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<f32> to memref<f32>
 // CHECK-ON:   %[[VAL_9:.*]] = memref.load %[[VAL_8]][] : memref<f32>
 // CHECK-ON:   %[[VAL_10:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_4]]] : memref<?xindex>
 // CHECK-ON:   %[[VAL_11:.*]] = memref.load %[[VAL_6]]{{\[}}%[[VAL_5]]] : memref<?xindex>
@@ -497,7 +497,7 @@ func.func @sparse_reduction_subf(%argx: tensor<f32>,
 // CHECK-OFF-DAG:   %[[VAL_3:.*]] = arith.constant 1 : index
 // CHECK-OFF-DAG:   %[[VAL_4:.*]] = sparse_tensor.positions %[[VAL_1]] {level = 0 : index} : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
 // CHECK-OFF-DAG:   %[[VAL_5:.*]] = sparse_tensor.values %[[VAL_1]] : tensor<?xf32, #sparse{{[0-9]*}}> to memref<?xf32>
-// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : memref<f32>
+// CHECK-OFF-DAG:   %[[VAL_6:.*]] = bufferization.to_memref %[[VAL_0]] : tensor<f32> to memref<f32>
 // CHECK-OFF:   %[[VAL_7:.*]] = memref.load %[[VAL_6]][] : memref<f32>
 // CHECK-OFF:   %[[VAL_8:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_2]]] : memref<?xindex>
 // CHECK-OFF:   %[[VAL_9:.*]] = memref.load %[[VAL_4]]{{\[}}%[[VAL_3]]] : memref<?xindex>
diff --git a/mlir/test/Dialect/Tensor/bufferize.mlir b/mlir/test/Dialect/Tensor/bufferize.mlir
index 3a3c8af15e6e41..ecd285be461947 100644
--- a/mlir/test/Dialect/Tensor/bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/bufferize.mlir
@@ -3,7 +3,7 @@
 // CHECK-LABEL:   func @dim(
 // CHECK-SAME:              %[[TENSOR:.*]]: tensor<*xf32>,
 // CHECK-SAME:              %[[INDEX:.*]]: index) -> index {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<*xf32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<*xf32> to memref<*xf32>
 // CHECK:           %[[EXTENT:.*]] = memref.dim %[[MEMREF]], %[[INDEX]] : memref<*xf32>
 // CHECK:           return %[[EXTENT]] : index
 func.func @dim(%arg0: tensor<*xf32>, %arg1: index) -> index {
@@ -39,7 +39,7 @@ func.func @tensor.cast(%arg0: tensor<?xindex>) -> tensor<2xindex> {
 
 // CHECK-LABEL:   func @tensor.cast_from_unranked(
 // CHECK-SAME:                                    %[[TENSOR:.*]]: tensor<*xf32>) -> tensor<2xf32> {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<*xf32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<*xf32> to memref<*xf32>
 // CHECK:           %[[CASTED_MEMREF:.*]] = memref.cast %[[MEMREF]] : memref<*xf32> to memref<2xf32, strided<[?], offset: ?>>
 // CHECK:           %[[RET:.*]] = bufferization.to_tensor %[[CASTED_MEMREF]] : memref<2xf32, strided<[?], offset: ?>>
 // CHECK:           return %[[RET]] : tensor<2xf32>
@@ -52,7 +52,7 @@ func.func @tensor.cast_from_unranked(%arg0: tensor<*xf32>) -> tensor<2xf32> {
 
 // CHECK-LABEL:   func @tensor.cast_to_unranked(
 // CHECK-SAME:                                  %[[TENSOR:.*]]: tensor<2xf32>) -> tensor<*xf32> {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<2xf32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<2xf32> to memref<2xf32>
 // CHECK:           %[[CASTED_MEMREF:.*]] = memref.cast %[[MEMREF]] : memref<2xf32> to memref<*xf32>
 // CHECK:           %[[RET:.*]] = bufferization.to_tensor %[[CASTED_MEMREF]] : memref<*xf32>
 // CHECK:           return %[[RET]] : tensor<*xf32>
@@ -77,7 +77,7 @@ func.func @tensor.empty() -> tensor<5xf32> {
 // CHECK-LABEL:   func @tensor.extract(
 // CHECK-SAME:                  %[[TENSOR:.*]]: tensor<?xf32>,
 // CHECK-SAME:                  %[[IDX:.*]]: index) -> f32 {
-// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : memref<?xf32>
+// CHECK:           %[[MEMREF:.*]] = bufferization.to_memref %[[TENSOR]] : tensor<?xf32> to memref<?xf32>
 // CHECK:           %[[RET:.*]] = memref.load %[[MEMREF]][%[[IDX]]] : memref<?xf32>
 // CHECK:           return %[[RET]] : f32
 // CHECK:         }
@@ -199,7 +199,7 @@ func.func @tensor.from_elements_3d(%f0 : f32) -> tensor<3x2x2xf32> {
 // CHECK-LABEL:   func @tensor.generate(
 // CHECK-SAME:        %[[ARG:.*]]: tensor<*xf32>,
 // CHECK-SAME:        %[[DYNAMIC_EXTENT:.*]]: index) -> tensor<?xindex> {
-// CHECK-DAG:       %[[ARG_M:.*]] = bufferization.to_memref %[[ARG]] : memref<*xf32>
+// CHECK-DAG:       %[[ARG_M:.*]] = bufferization.to_memref %[[ARG]] : tensor<*xf32> to memref<*xf32>
 // CHECK-DAG:       %[[ALLOC:.*]] = memref.alloc(%[[DYNAMIC_EXTENT]]) {{.*}} : memref<?xindex>
 // CHECK:           %[[ALLOC_T:.*]] = bufferization.to_tensor %[[ALLOC]]
 // CHECK:           %[[MAPPED:.*]] = linalg.map
@@ -266,7 +266,7 @@ func.func @tensor.generate_unknown_ops_in_body(%arg0: index) -> tensor<?xindex>
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x?xf32>, %[[idx1:.*]]: index, %[[idx2:.*]]: index
 func.func @tensor.extract_slice(
     %t1: tensor<?x?xf32>, %idx1: index, %idx2: index) -> tensor<?x10xf32> {
-  // CHECK: %[[m:.*]] = bufferization.to_memref %[[t1]] : memref<?x?xf32>
+  // CHECK: %[[m:.*]] = bufferization.to_memref %[[t1]] : tensor<?x?xf32> to memref<?x?xf32>
   // CHECK: %[[r:.*]] = memref.subview %[[m]][5, %[[idx2]]] [%[[idx1]], 10] [1, 1] : memref<?x?xf32> to memref<?x10xf32, strided<[?, 1], offset: ?>>
   %0 = tensor.extract_slice %t1[5, %idx2][%idx1, 10][1, 1]
       : tensor<?x?xf32> to tensor<?x10xf32>
@@ -282,7 +282,7 @@ func.func @tensor.extract_slice(
 //  CHECK-SAME:     %[[idx2:.*]]: index
 func.func @tensor.extract_slice_rank_reducing(
     %t1: tensor<?x10x?xf32>, %idx1: index, %idx2: index) -> tensor<?x15xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x10x?xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<?x10x?xf32> to memref<?x10x?xf32>
   // CHECK: %[[r:.*]] = memref.subview %[[m1]][5, %[[idx1]], 10] [%[[idx2]], 1, 15] [1, 1, 1] : memref<?x10x?xf32> to memref<?x15xf32, strided<[?, 1], offset: ?>>
   %0 = tensor.extract_slice %t1[5, %idx1, 10][%idx2, 1, 15][1, 1, 1]
       : tensor<?x10x?xf32> to tensor<?x15xf32>
@@ -300,8 +300,8 @@ func.func @tensor.insert_slice(%t1: tensor<?x?xf32>, %t2: tensor<?x10xf32>,
                                %idx1: index, %idx2: index) -> tensor<?x?xf32> {
   // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
-  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x?xf32>
-  // CHECK-DAG: %[[m2:.*]] = bufferization.to_memref %[[t2]] : memref<?x10xf32>
+  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<?x?xf32> to memref<?x?xf32>
+  // CHECK-DAG: %[[m2:.*]] = bufferization.to_memref %[[t2]] : tensor<?x10xf32> to memref<?x10xf32>
   // CHECK-DAG: %[[dim0:.*]] = memref.dim %[[m1]], %[[c0]]
   // CHECK-DAG: %[[dim1:.*]] = memref.dim %[[m1]], %[[c1]]
   //     CHECK: %[[alloc:.*]] = memref.alloc(%[[dim0]], %[[dim1]])
@@ -353,7 +353,7 @@ func.func @tensor.insert_slice_rank_reducing_2(
 //  CHECK-SAME:     %[[f:.*]]: f32
 func.func @tensor.insert(%t1: tensor<5xf32>, %idx1: index, %f: f32) -> tensor<5xf32> {
   // CHECK-DAG: %[[alloc:.*]] = memref.alloc() {{.*}} : memref<5xf32>
-  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<5xf32>
+  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<5xf32> to memref<5xf32>
   // CHECK: memref.copy %[[m1]], %[[alloc]]
   // CHECK: memref.store %[[f]], %[[alloc]][%[[idx1]]]
   %0 = tensor.insert %f into %t1[%idx1] : tensor<5xf32>
@@ -368,7 +368,7 @@ func.func @tensor.insert(%t1: tensor<5xf32>, %idx1: index, %f: f32) -> tensor<5x
 // CHECK-LABEL: func @tensor.expand_shape(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x10xf32>
 func.func @tensor.expand_shape(%t1: tensor<?x10xf32>, %sz0: index) -> tensor<2x?x10xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x10xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]]
   // CHECK: %[[C0:.*]] = arith.constant 0 : index
   // CHECK: %[[DIM:.*]] = memref.dim %[[m1]], %[[C0]] : memref<?x10xf32>
   // CHECK: %[[C2:.*]] = arith.constant 2 : index
@@ -388,7 +388,7 @@ func.func @tensor.expand_shape(%t1: tensor<?x10xf32>, %sz0: index) -> tensor<2x?
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x20xf32>
 func.func @tensor.expand_shape_of_slice(
     %t1: tensor<?x20xf32>, %o1: index, %s1: index, %sz0: index) -> tensor<?x7x2x5xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x20xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] :
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}, 5] [%{{.*}}, 10] [1, 1] : memref<?x20xf32> to memref<?x10xf32, strided<[20, 1], offset: ?>>
   %0 = tensor.extract_slice %t1[%o1, 5][%s1, 10][1, 1] :
       tensor<?x20xf32> to tensor<?x10xf32>
@@ -408,7 +408,7 @@ func.func @tensor.expand_shape_of_slice(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?xf32>
 func.func @tensor.expand_shape_of_scalar_slice(
     %t1: tensor<?xf32>, %o1: index, %s1: index) -> tensor<1xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<?xf32> to memref<?xf32>
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]][%{{.*}}] [1] [1] :  memref<?xf32> to memref<f32, strided<[], offset: ?>>
   %0 = tensor.extract_slice %t1[%o1][1][1] : tensor<?xf32> to tensor<f32>
   // CHECK: %[[expanded:.*]] = memref.expand_shape %[[subview]] [] output_shape [1] : memref<f32, strided{{.*}}> into memref<1xf32, strided<[1], offset: ?>>
@@ -423,7 +423,7 @@ func.func @tensor.expand_shape_of_scalar_slice(
 // CHECK-LABEL: func @tensor.collapse_shape(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<2x?x?xf32>
 func.func @tensor.collapse_shape(%t1: tensor<2x?x?xf32>) -> tensor<?x?xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<2x?x?xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<2x?x?xf32> to memref<2x?x?xf32>
   // CHECK: %[[collapsed:.*]] = memref.collapse_shape %[[m1]] [
   // CHECK-SAME: [0, 1], [2]] : memref<2x?x?xf32> into memref<?x?xf32>
   %0 = tensor.collapse_shape %t1 [[0, 1], [2]]
@@ -439,7 +439,7 @@ func.func @tensor.collapse_shape(%t1: tensor<2x?x?xf32>) -> tensor<?x?xf32> {
 // CHECK-LABEL: func @tensor.collapse_shape_to_scalar(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<1x1x1xf32>
 func.func @tensor.collapse_shape_to_scalar(%t1: tensor<1x1x1xf32>) -> tensor<f32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<1x1x1xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<1x1x1xf32> to memref<1x1x1xf32>
   // CHECK: %[[collapsed:.*]] = memref.collapse_shape %[[m1]] [] : memref<1x1x1xf32> into memref<f32>
   %0 = tensor.collapse_shape %t1 []
       : tensor<1x1x1xf32> into tensor<f32>
@@ -528,7 +528,7 @@ func.func @tensor.collapse_shape_of_slice5(%arg0: tensor<2x2x2xi64>) -> tensor<4
 // CHECK-LABEL: func @tensor.reshape(
 //  CHECK-SAME:     %[[t1:.*]]: tensor<?x10xf32>
 func.func @tensor.reshape(%t1: tensor<?x10xf32>) -> tensor<2x2x5xf32> {
-  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x10xf32>
+  // CHECK: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<?x10xf32> to memref<?x10xf32>
 
   // CHECK: %[[two:.*]] = arith.constant 2 : i64
   %two = arith.constant 2 : i64
@@ -560,7 +560,7 @@ func.func @tensor.reshape(%t1: tensor<?x10xf32>) -> tensor<2x2x5xf32> {
 //  CHECK-SAME:   %[[t1:.*]]: tensor<?x10xindex>, %[[l2:.*]]: index, %[[h1:.*]]: index, %[[h2:.*]]: index
 func.func @tensor.pad(%t1: tensor<?x10xindex>, %l2: index, %h1: index,
                       %h2: index) -> tensor<?x?xindex> {
-  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : memref<?x10xindex>
+  // CHECK-DAG: %[[m1:.*]] = bufferization.to_memref %[[t1]] : tensor<?x10xindex> to memref<?x10xindex>
   // CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index
   // CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index
   // CHECK-DAG: %[[dim0:.*]] = memref.dim %[[m1]], %[[c0]]
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir
new file mode 100644
index 00000000000000..7398fdf614e1a7
--- /dev/null
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize-encodings.mlir
@@ -0,0 +1,20 @@
+// RUN: mlir-opt %s -one-shot-bufferize="use-encoding-for-memory-space" -split-input-file | FileCheck %s
+
+func.func @from_elements(%fill: f32, %f: f32, %idx: index) -> tensor<3xf32, 1> {
+  %t = tensor.from_elements %fill, %fill, %fill : tensor<3xf32, 1>
+  %i = tensor.insert %f into %t[%idx] : tensor<3xf32, 1>
+  return %i : tensor<3xf32, 1>
+}
+
+// CHECK-LABEL: @from_elements
+//  CHECK-SAME: (%[[arg0:.+]]: f32, %[[arg1:.+]]: f32, %[[arg2:.+]]: index) -> tensor<3xf32, 1 : i64>
+//       CHECK:     %[[alloc:.+]] = memref.alloc() {{.*}} : memref<3xf32, 1>
+//       CHECK-DAG:     %[[c0:.+]] = arith.constant 0 : index
+//       CHECK-DAG:     %[[c1:.+]] = arith.constant 1 : index
+//       CHECK-DAG:     %[[c2:.+]] = arith.constant 2 : index
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c0]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c1]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg0]], %[[alloc]][%[[c2]]] : memref<3xf32, 1>
+//       CHECK:     memref.store %[[arg1]], %[[alloc]][%[[arg2]]] : memref<3xf32, 1>
+//       CHECK:     %[[v0:.+]] = bufferization.to_tensor %[[alloc]] : memref<3xf32, 1> to tensor<3xf32, 1 : i64>
+//       CHECK:     return %[[v0]] : tensor<3xf32, 1 : i64>
diff --git a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
index dc4306b8316ab7..af4f84640890bb 100644
--- a/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Tensor/one-shot-bufferize.mlir
@@ -402,7 +402,7 @@ func.func @tensor.reshape() -> tensor<2x2x5xf32> {
 // CHECK-SAME:    %[[INPUT:[a-zA-Z0-9]*]]: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>,
 // CHECK-SAME:    %[[LAYOUT:[a-zA-Z0-9]*]]: memref<2xi32, strided<[?], offset: ?>>,
 func.func @reshape_with_non_identity_layout(%arg0: memref<2x2xf32, strided<[?, ?], offset: ?>, 3>, %arg1: tensor<2xi32>, %idx: index) -> f32 {
-  %t = bufferization.to_tensor %arg0 restrict : memref<2x2xf32, strided<[?, ?], offset: ?>, 3>
+  %t = bufferization.to_tensor %arg0 restrict : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to tensor<2x2xf32>
 
   // CHECK: %[[SUBVIEW:.+]] = memref.subview %[[INPUT]][1, 0] [1, 2] [1, 1] : memref<2x2xf32, strided<[?, ?], offset: ?>, 3> to memref<2xf32, strided<[?], offset: ?>, 3>
   %extracted_slice = tensor.extract_slice %t[1, 0] [1, 2] [1, 1] : tensor<2x2xf32> to tensor<2xf32>
diff --git a/mlir/test/Dialect/Vector/bufferize.mlir b/mlir/test/Dialect/Vector/bufferize.mlir
index 3399f60a2c3bf3..c2abebe706ac0e 100644
--- a/mlir/test/Dialect/Vector/bufferize.mlir
+++ b/mlir/test/Dialect/Vector/bufferize.mlir
@@ -2,7 +2,7 @@
 
 // CHECK-LABEL: func @transfer_read(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[pad:.*]]: f32)
-//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]] : memref<?x?xf32>
+//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]] : tensor<?x?xf32> to memref<?x?xf32>
 //       CHECK:   %[[r:.*]] = vector.transfer_read %[[m]][%[[o1]], %[[o2]]], %[[pad]] {in_bounds = [true, false]} : memref<?x?xf32>, vector<5x6xf32>
 //       CHECK:   return %[[r]]
 func.func @transfer_read(%t: tensor<?x?xf32>, %o1: index,
@@ -16,7 +16,7 @@ func.func @transfer_read(%t: tensor<?x?xf32>, %o1: index,
 
 // CHECK-LABEL: func @transfer_write(
 //  CHECK-SAME:     %[[t:.*]]: tensor<?x?xf32>, %[[o1:.*]]: index, %[[o2:.*]]: index, %[[vec:.*]]: vector<5x6xf32>, %[[mask:.*]]: vector<5x6xi1>)
-//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]] : memref<?x?xf32>
+//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]] : tensor<?x?xf32> to memref<?x?xf32>
 //       CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}}, %{{.*}}) {{.*}} : memref<?x?xf32>
 //       CHECK:   memref.copy %[[m]], %[[alloc]]
 //       CHECK:   vector.transfer_write %[[vec]], %[[alloc]][%[[o1]], %[[o2]]], %[[mask]] {in_bounds = [true, false]} : vector<5x6xf32>, memref<?x?xf32>
@@ -35,7 +35,7 @@ func.func @transfer_write(%t: tensor<?x?xf32>, %o1: index,
 // CHECK-LABEL: func @gather(
 //  CHECK-SAME:     %[[base:.*]]: tensor<?x?xf32>, %[[v:.*]]: vector<16xi32>,
 //  CHECK-SAME:     %[[mask:.*]]: vector<16xi1>, %[[pass_thru:.*]]: vector<16xf32>)
-//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[base]] : memref<?x?xf32>
+//       CHECK:   %[[m:.*]] = bufferization.to_memref %[[base]] : tensor<?x?xf32> to memref<?x?xf32>
 //       CHECK:   %[[c0:.*]] = arith.constant 0 : index
 //       CHECK:   %[[out:.*]] = vector.gather %[[m]][%[[c0]], %[[c0]]] [%[[v]]], %[[mask]], %[[pass_thru]] : memref<?x?xf32>, vector<16xi32>, vector<16xi1>, vector<16xf32> into vector<16xf32>
 func.func @gather(%base: tensor<?x?xf32>, %v: vector<16xi32>, %mask: vector<16xi1>, %pass_thru: vector<16xf32>) -> vector<16xf32> {
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
index a2afc4d13943e1..415697aaaba5d8 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-matvec-const.mlir
@@ -47,7 +47,7 @@ module {
 
     // Call the kernel with an vector taken from global memory.
     %xbuf = memref.get_global @__constant_64xf64 : memref<64xf64>
-    %x = bufferization.to_tensor %xbuf restrict : memref<64xf64>
+    %x = bufferization.to_tensor %xbuf restrict : memref<64xf64> to tensor<64xf64>
     %0 = call @matvec(%A, %x, %y) : (tensor<1024x64xf64, #CSR>, tensor<64xf64>, tensor<1024xf64>) -> tensor<1024xf64>
 
     //
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
index 5a624e64342974..6ae54d65ea37b5 100644
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-mma-2-4-f16.mlir
@@ -2,7 +2,7 @@
 // NOTE: this test requires gpu-sm80
 //
 // RUN: mlir-opt \
-// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
+// RUN: --pass-pipeline="builtin.module(gpu.module(strip-debuginfo,convert-gpu-to-nvvm,convert-nvgpu-to-nvvm,affine-expand-index-ops,lower-affine,arith-expand,convert-arith-to-llvm),convert-vector-to-llvm,canonicalize,cse)" \
 // RUN: %s \
 // RUN: | mlir-opt --gpu-lower-to-nvvm-pipeline="cubin-chip=sm_80 cubin-features=+ptx71 cubin-format=%gpu_compilation_format" \
 // RUN: | mlir-cpu-runner \
diff --git a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir
index 05a78e32b9e115..1b5fc9070ef6cc 100644
--- a/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir
+++ b/mlir/test/Integration/Dialect/Tosa/CPU/test-maxpool-dynamic.mlir
@@ -54,7 +54,7 @@ func.func @main() {
   %result_static  = func.call @max_pool_static(%A) : (!tensor_type) -> !tensor_type
   %result_dynamic = func.call @max_pool_dynamic(%A_dynamic) : (tensor<?x?x?x?xf32>) -> tensor<?x?x?x?xf32>
 
-  %static_buffer = bufferization.to_memref %result_static : !memref_type
+  %static_buffer = bufferization.to_memref %result_static : !tensor_type to !memref_type
   %unranked_static_buffer = memref.cast %static_buffer : !memref_type to memref<*xf32>
 
   // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data =
@@ -81,7 +81,7 @@ func.func @main() {
 
   func.call @printMemrefF32(%unranked_static_buffer) : (memref<*xf32>) -> ()
 
-  %dynamic_buffer = bufferization.to_memref %result_dynamic : memref<?x?x?x?xf32>
+  %dynamic_buffer = bufferization.to_memref %result_dynamic : tensor<?x?x?x?xf32> to memref<?x?x?x?xf32>
   %unranked_dynamic_buffer = memref.cast %dynamic_buffer : memref<?x?x?x?xf32> to memref<*xf32>
 
   // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 4, 4, 1] strides = [16, 4, 1, 1] data =
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir
index a7c5b91273423b..8cf15cd6978682 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/mulf-full.mlir
@@ -100,8 +100,8 @@ func.func @entry() -> i32 {
   ]> : tensor<16x32xbf16>
 
   // Set up memory.
-  %a = bufferization.to_memref %0 : memref<16x32xbf16>
-  %b = bufferization.to_memref %1 : memref<16x32xbf16>
+  %a = bufferization.to_memref %0 : tensor<16x32xbf16> to memref<16x32xbf16>
+  %b = bufferization.to_memref %1 : tensor<16x32xbf16> to memref<16x32xbf16>
   %c = memref.alloc() : memref<16x16xf32>
 
   // Call kernel.
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir b/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir
index 7b7ee54db8c348..652ba0698c4c9c 100644
--- a/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir
+++ b/mlir/test/Integration/Dialect/Vector/CPU/AMX/muli-full.mlir
@@ -100,8 +100,8 @@ func.func @entry() -> i32 {
   ]> : tensor<16x64xi8>
 
   // Set up memory.
-  %a = bufferization.to_memref %0 : memref<16x64xi8>
-  %b = bufferization.to_memref %1 : memref<16x64xi8>
+  %a = bufferization.to_memref %0 : tensor<16x64xi8> to memref<16x64xi8>
+  %b = bufferization.to_memref %1 : tensor<16x64xi8> to memref<16x64xi8>
   %c = memref.alloc() : memref<16x16xi32>
 
   // Call kernel.

From 2a0162c0193a73ef16aee67dbe66abb9d2e4717c Mon Sep 17 00:00:00 2001
From: SpencerAbson <Spencer.Abson@arm.com>
Date: Tue, 26 Nov 2024 16:50:51 +0000
Subject: [PATCH 29/41] [AArch64][SVE] Change the immediate argument in svextq
 (#115340)

In order to align with `svext` and NEON `vext`/`vextq`, this patch
changes immediate argument in `svextq` such that it refers to elements
of the size of those of the source vector, rather than bytes. The [spec
for this
intrinsic](https://github.com/ARM-software/acle/blob/main/main/acle.md#extq)
is ambiguous about the meaning of this argument, this issue was raised
after there was a differing interpretation for it from the implementers
of the ACLE in GCC.

For example (with our current implementation):

`svextq_f64(zn_f64, zm_f64, 1)` would, for each 128-bit segment of
`zn_f64,` concatenate the highest 15 bytes of this segment with the
first byte of the corresponding segment of `zm_f64`.

After this patch, the behavior of `svextq_f64(zn_f64, zm_f64, 1)` would
be, for each 128-bit vector segment of `zn_f64`, to concatenate the
higher doubleword of this segment with the lower doubleword of the
corresponding segment of `zm_f64`.

The range of the immediate argument in `svextq` would be modified such
that it is:
- [0,15] for `svextq_{s8,u8}`
- [0,7] for `svextq_{s16,u16,f16,bf16}`
- [0,3] for `svextq_{s32,u32,f32}`
- [0,1] for `svextq_{s64,u64,f64}`
---
 clang/include/clang/Basic/arm_sve.td          |  2 +-
 .../sve2p1-intrinsics/acle_sve2p1_extq.c      | 42 +++++++++---------
 .../acle_sve2p1_imm.cpp                       | 44 +++++++++++++++++--
 .../lib/Target/AArch64/AArch64InstrFormats.td | 33 ++++++++++++++
 llvm/lib/Target/AArch64/SVEInstrFormats.td    | 17 ++++---
 .../CodeGen/AArch64/sve2p1-intrinsics-extq.ll | 28 ++++++------
 6 files changed, 118 insertions(+), 48 deletions(-)

diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
index d492fae4145b92..c6b7cd637b9ece 100644
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -2286,7 +2286,7 @@ let SVETargetGuard = "sve2p1", SMETargetGuard = InvalidMode in {
   def SVTBLQ : SInst<"svtblq[_{d}]", "ddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tblq">;
   def SVTBXQ : SInst<"svtbxq[_{d}]", "dddu", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_tbxq">;
   // EXTQ
-  def EXTQ : SInst<"svextq[_{d}]", "dddk", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_extq", [], [ImmCheck<2, ImmCheck0_15>]>;
+  def EXTQ : SInst<"svextq[_{d}]", "dddk", "cUcsUsiUilUlbhfd", MergeNone, "aarch64_sve_extq", [], [ImmCheck<2, ImmCheckLaneIndex, 0>]>;
 
   // PMOV
   // Move to Pred
diff --git a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c
index 5fbfa881500ba1..06eec1e00900cc 100644
--- a/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c
+++ b/clang/test/CodeGen/AArch64/sve2p1-intrinsics/acle_sve2p1_extq.c
@@ -103,111 +103,111 @@ svuint32_t test_svextq_u32(svuint32_t zn, svuint32_t zm) {
 // CHECK-LABEL: define dso_local <vscale x 4 x i32> @test_svextq_s32
 // CHECK-SAME: (<vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.extq.nxv4i32(<vscale x 4 x i32> [[ZN]], <vscale x 4 x i32> [[ZM]], i32 6)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.extq.nxv4i32(<vscale x 4 x i32> [[ZN]], <vscale x 4 x i32> [[ZM]], i32 3)
 // CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x i32> @_Z15test_svextq_s32u11__SVInt32_tS_
 // CPP-CHECK-SAME: (<vscale x 4 x i32> [[ZN:%.*]], <vscale x 4 x i32> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.extq.nxv4i32(<vscale x 4 x i32> [[ZN]], <vscale x 4 x i32> [[ZM]], i32 6)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x i32> @llvm.aarch64.sve.extq.nxv4i32(<vscale x 4 x i32> [[ZN]], <vscale x 4 x i32> [[ZM]], i32 3)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP0]]
 //
 svint32_t test_svextq_s32(svint32_t zn, svint32_t zm) {
-    return SVE_ACLE_FUNC(svextq, _s32,,)(zn, zm, 6);
+    return SVE_ACLE_FUNC(svextq, _s32,,)(zn, zm, 3);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svextq_u64
 // CHECK-SAME: (<vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 3)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 1)
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z15test_svextq_u64u12__SVUint64_tS_
 // CPP-CHECK-SAME: (<vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 3)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 1)
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 svuint64_t test_svextq_u64(svuint64_t zn, svuint64_t zm) {
-  return SVE_ACLE_FUNC(svextq, _u64,,)(zn, zm, 3);
+  return SVE_ACLE_FUNC(svextq, _u64,,)(zn, zm, 1);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 2 x i64> @test_svextq_s64
 // CHECK-SAME: (<vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 7)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 0)
 // CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x i64> @_Z15test_svextq_s64u11__SVInt64_tS_
 // CPP-CHECK-SAME: (<vscale x 2 x i64> [[ZN:%.*]], <vscale x 2 x i64> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 7)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> [[ZN]], <vscale x 2 x i64> [[ZM]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 2 x i64> [[TMP0]]
 //
 svint64_t test_svextq_s64(svint64_t zn, svint64_t zm) {
-    return SVE_ACLE_FUNC(svextq, _s64,,)(zn, zm, 7);
+    return SVE_ACLE_FUNC(svextq, _s64,,)(zn, zm, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 8 x half> @test_svextq_f16
 // CHECK-SAME: (<vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]], i32 8)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]], i32 7)
 // CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x half> @_Z15test_svextq_f16u13__SVFloat16_tS_
 // CPP-CHECK-SAME: (<vscale x 8 x half> [[ZN:%.*]], <vscale x 8 x half> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]], i32 8)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> [[ZN]], <vscale x 8 x half> [[ZM]], i32 7)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x half> [[TMP0]]
 //
 svfloat16_t test_svextq_f16(svfloat16_t zn, svfloat16_t zm) {
-    return SVE_ACLE_FUNC(svextq, _f16,,)(zn, zm, 8);
+    return SVE_ACLE_FUNC(svextq, _f16,,)(zn, zm, 7);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 4 x float> @test_svextq_f32
 // CHECK-SAME: (<vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]], i32 9)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]], i32 2)
 // CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 4 x float> @_Z15test_svextq_f32u13__SVFloat32_tS_
 // CPP-CHECK-SAME: (<vscale x 4 x float> [[ZN:%.*]], <vscale x 4 x float> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]], i32 9)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> [[ZN]], <vscale x 4 x float> [[ZM]], i32 2)
 // CPP-CHECK-NEXT:    ret <vscale x 4 x float> [[TMP0]]
 //
 svfloat32_t test_svextq_f32(svfloat32_t zn, svfloat32_t zm) {
-    return SVE_ACLE_FUNC(svextq, _f32,,)(zn, zm, 9);
+    return SVE_ACLE_FUNC(svextq, _f32,,)(zn, zm, 2);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 2 x double> @test_svextq_f64
 // CHECK-SAME: (<vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]], i32 10)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]], i32 0)
 // CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 2 x double> @_Z15test_svextq_f64u13__SVFloat64_tS_
 // CPP-CHECK-SAME: (<vscale x 2 x double> [[ZN:%.*]], <vscale x 2 x double> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]], i32 10)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> [[ZN]], <vscale x 2 x double> [[ZM]], i32 0)
 // CPP-CHECK-NEXT:    ret <vscale x 2 x double> [[TMP0]]
 //
 svfloat64_t test_svextq_f64(svfloat64_t zn, svfloat64_t zm) {
-    return SVE_ACLE_FUNC(svextq, _f64,,)(zn, zm, 10);
+    return SVE_ACLE_FUNC(svextq, _f64,,)(zn, zm, 0);
 }
 
 // CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @test_svextq_bf16
 // CHECK-SAME: (<vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]], i32 11)
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]], i32 6)
 // CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
 // CPP-CHECK-LABEL: define dso_local <vscale x 8 x bfloat> @_Z16test_svextq_bf16u14__SVBfloat16_tS_
 // CPP-CHECK-SAME: (<vscale x 8 x bfloat> [[ZN:%.*]], <vscale x 8 x bfloat> [[ZM:%.*]]) #[[ATTR0]] {
 // CPP-CHECK-NEXT:  entry:
-// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]], i32 11)
+// CPP-CHECK-NEXT:    [[TMP0:%.*]] = tail call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> [[ZN]], <vscale x 8 x bfloat> [[ZM]], i32 6)
 // CPP-CHECK-NEXT:    ret <vscale x 8 x bfloat> [[TMP0]]
 //
 svbfloat16_t test_svextq_bf16(svbfloat16_t zn, svbfloat16_t zm) {
-    return SVE_ACLE_FUNC(svextq, _bf16,,)(zn, zm, 11);
+    return SVE_ACLE_FUNC(svextq, _bf16,,)(zn, zm, 6);
 }
diff --git a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
index f7047c99e884e9..ac7586e202b96c 100644
--- a/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
+++ b/clang/test/Sema/aarch64-sve2p1-intrinsics/acle_sve2p1_imm.cpp
@@ -138,9 +138,47 @@ void test_svbfmul_lane(svbfloat16_t zn, svbfloat16_t zm, uint64_t idx){
 }
 
 __attribute__((target("+sve2p1")))
-void test_svextq_lane(svint16_t zn_i16, svint16_t zm_i16, svfloat16_t zn_f16, svfloat16_t zm_f16){
-  svextq_s16(zn_i16, zm_i16, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}}
-  svextq_f16(zn_f16, zm_f16, 16);  // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+void test_svextq_8b_offset(svint8_t s8, svuint8_t u8){
+  svextq_s8(s8, s8, -1);  // expected-error {{argument value -1 is outside the valid range [0, 15]}}
+  svextq_u8(u8, u8, -1);  // expected-error {{argument value -1 is outside the valid range [0, 15]}}
+
+  svextq_s8(s8, s8, 16);   // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+  svextq_u8(u8, u8, 16);   // expected-error {{argument value 16 is outside the valid range [0, 15]}}
+}
+
+__attribute__((target("+sve2p1")))
+void test_svextq_16b_offset(svint16_t s16, svuint16_t u16, svfloat16_t f16, svbfloat16_t bf16){
+  svextq_s16(s16, s16, -1);     // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  svextq_u16(u16, u16, -1);     // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  svextq_f16(f16, f16, -1);     // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+  svextq_bf16(bf16, bf16, -1);  // expected-error {{argument value -1 is outside the valid range [0, 7]}}
+
+  svextq_s16(s16, s16, 8);     // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svextq_u16(u16, u16, 8);     // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svextq_f16(f16, f16, 8);     // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+  svextq_bf16(bf16, bf16, 8);  // expected-error {{argument value 8 is outside the valid range [0, 7]}}
+}
+
+__attribute__((target("+sve2p1")))
+void test_svextq_32b_offset(svint32_t s32, svuint32_t u32, svfloat32_t f32){
+  svextq_s32(s32, s32, -1);   // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  svextq_u32(u32, u32, -1);   // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+  svextq_f32(f32, f32, -1);   // expected-error {{argument value -1 is outside the valid range [0, 3]}}
+
+  svextq_s32(s32, s32, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svextq_u32(u32, u32, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+  svextq_f32(f32, f32, 4);   // expected-error {{argument value 4 is outside the valid range [0, 3]}}
+}
+
+__attribute__((target("+sve2p1")))
+void test_svextq_64b_offset(svint64_t s64, svuint64_t u64, svfloat64_t f64){
+  svextq_s64(s64, s64, -1);   // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  svextq_u64(u64, u64, -1);   // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+  svextq_f64(f64, f64, -1);   // expected-error {{argument value -1 is outside the valid range [0, 1]}}
+
+  svextq_s64(s64, s64, 2);   // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svextq_u64(u64, u64, 2);   // expected-error {{argument value 2 is outside the valid range [0, 1]}}
+  svextq_f64(f64, f64, 2);   // expected-error {{argument value 2 is outside the valid range [0, 1]}}
 }
 
 __attribute__((target("+sve2p1")))
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index 242aea5fbb0142..a8ba89f784c8cd 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -529,6 +529,18 @@ def UImmS8XForm : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
 }]>;
 
+def UImmM2XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 2, SDLoc(N), MVT::i32);
+}]>;
+
+def UImmM4XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 4, SDLoc(N), MVT::i32);
+}]>;
+
+def UImmM8XForm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue() * 8, SDLoc(N), MVT::i32);
+}]>;
+
 // uimm5sN predicate - True if the immediate is a multiple of N in the range
 // [0 * N, 32 * N].
 def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
@@ -1098,6 +1110,13 @@ def timm32_0_1 : Operand<i32>, TImmLeaf<i32, [{
   let ParserMatchClass = Imm0_1Operand;
 }
 
+// extq_timm32_0_1m8 - True if the 32-bit immediate is in the range [0,1], scale this immediate
+// by a factor of 8 after a match is made.
+def extq_timm32_0_1m8 : Operand<i32>, TImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 2;}], UImmM8XForm> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
 // timm32_1_1 - True if the 32-bit immediate is in the range [1,1]
 def timm32_1_1 : Operand<i32>, TImmLeaf<i32, [{
     return ((uint32_t)Imm) == 1;
@@ -1140,6 +1159,13 @@ def timm32_0_3 : Operand<i32>, TImmLeaf<i32, [{
   let ParserMatchClass = Imm0_3Operand;
 }
 
+// extq_timm32_0_3m4 - True if the 32-bit immediate is in the range [0,3], scale this immediate
+// by a factor of 4 after a match is made.
+def extq_timm32_0_3m4 : Operand<i32>, TImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 4;}], UImmM4XForm> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
 // timm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
 def timm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
   return ((uint32_t)Imm) < 8;
@@ -1147,6 +1173,13 @@ def timm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
   let ParserMatchClass = Imm0_7Operand;
 }
 
+// extq_timm32_0_7m2 - True if the 32-bit immediate is in the range [0,7], scale this immediate
+// by a factor of 2 after a match is made.
+def extq_timm32_0_7m2 : Operand<i32>, TImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 8;}], UImmM2XForm> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
 // timm32_1_7 predicate - True if the 32-bit immediate is in the range [1,7]
 def timm32_1_7 : Operand<i32>, TImmLeaf<i32, [{
   return ((uint32_t)Imm) > 0 && ((uint32_t)Imm) < 8;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 1ddb913f013f5e..4bdf327e0d3fc3 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10459,15 +10459,14 @@ class sve2p1_extq<string mnemonic>
 multiclass sve2p1_extq<string mnemonic, SDPatternOperator Op> {
   def NAME : sve2p1_extq<mnemonic>;
   def : SVE_3_Op_Imm_Pat<nxv16i8, Op, nxv16i8, nxv16i8, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv8i16, Op, nxv8i16, nxv8i16, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv4i32, Op, nxv4i32, nxv4i32, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv2i64, Op, nxv2i64, nxv2i64, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-
-  def : SVE_3_Op_Imm_Pat<nxv8f16, Op, nxv8f16, nxv8f16, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv4f32, Op, nxv4f32, nxv4f32, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv2f64, Op, nxv2f64, nxv2f64, i32, timm32_0_15, !cast<Instruction>(NAME)>;
-  def : SVE_3_Op_Imm_Pat<nxv8bf16, Op, nxv8bf16, nxv8bf16, i32, timm32_0_15, !cast<Instruction>(NAME
-)>;
+  def : SVE_3_Op_Imm_Pat<nxv8i16, Op, nxv8i16, nxv8i16, i32, extq_timm32_0_7m2, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Imm_Pat<nxv4i32, Op, nxv4i32, nxv4i32, i32, extq_timm32_0_3m4, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Imm_Pat<nxv2i64, Op, nxv2i64, nxv2i64, i32, extq_timm32_0_1m8, !cast<Instruction>(NAME)>;
+
+  def : SVE_3_Op_Imm_Pat<nxv8f16, Op, nxv8f16, nxv8f16, i32, extq_timm32_0_7m2, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Imm_Pat<nxv4f32, Op, nxv4f32, nxv4f32, i32, extq_timm32_0_3m4, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Imm_Pat<nxv2f64, Op, nxv2f64, nxv2f64, i32, extq_timm32_0_1m8, !cast<Instruction>(NAME)>;
+  def : SVE_3_Op_Imm_Pat<nxv8bf16, Op, nxv8bf16, nxv8bf16, i32, extq_timm32_0_7m2, !cast<Instruction>(NAME)>;
 }
 
 // SVE move predicate from vector
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll
index a49aa7cfcf8a2d..bb4c67fca5dc8b 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-extq.ll
@@ -4,16 +4,16 @@
 define <vscale x 16 x i8> @test_extq_i8 (<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) {
 ; CHECK-LABEL: test_extq_i8:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #0
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #15
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.extq.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 0)
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.extq.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm, i32 15)
   ret <vscale x 16 x i8> %res
 }
 
 define <vscale x 8 x i16> @test_extq_i16 (<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) {
 ; CHECK-LABEL: test_extq_i16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #1
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #2
 ; CHECK-NEXT:    ret
   %res = call <vscale x 8 x i16> @llvm.aarch64.sve.extq.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm, i32 1)
   ret <vscale x 8 x i16> %res
@@ -22,7 +22,7 @@ define <vscale x 8 x i16> @test_extq_i16 (<vscale x 8 x i16> %zn, <vscale x 8 x
 define <vscale x 4 x i32> @test_extq_i32 (<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm) {
 ; CHECK-LABEL: test_extq_i32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #2
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
   %res = call <vscale x 4 x i32> @llvm.aarch64.sve.extq.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i32> %zm, i32 2)
   ret <vscale x 4 x i32> %res
@@ -31,45 +31,45 @@ define <vscale x 4 x i32> @test_extq_i32 (<vscale x 4 x i32> %zn, <vscale x 4 x
 define <vscale x 2 x i64> @test_extq_i64 (<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm) {
 ; CHECK-LABEL: test_extq_i64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #3
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm, i32 3)
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.extq.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i64> %zm, i32 1)
   ret <vscale x 2 x i64> %res
 }
 
 define <vscale x 8 x half> @test_extq_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) {
 ; CHECK-LABEL: test_extq_f16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #4
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #14
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 4)
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.extq.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm, i32 7)
   ret <vscale x 8 x half> %res
 }
 
 define <vscale x 4 x float> @test_extq_f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm) {
 ; CHECK-LABEL: test_extq_f32:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #5
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #4
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm, i32 5)
+  %res = call <vscale x 4 x float> @llvm.aarch64.sve.extq.nxv4f32(<vscale x 4 x float> %zn, <vscale x 4 x float> %zm, i32 1)
   ret <vscale x 4 x float> %res
 }
 
 define <vscale x 2 x double> @test_extq_f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm) {
 ; CHECK-LABEL: test_extq_f64:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #6
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #8
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm, i32 6)
+  %res = call <vscale x 2 x double> @llvm.aarch64.sve.extq.nxv2f64(<vscale x 2 x double> %zn, <vscale x 2 x double> %zm, i32 1)
   ret <vscale x 2 x double> %res
 }
 
 define <vscale x 8 x bfloat> @test_extq_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) {
 ; CHECK-LABEL: test_extq_bf16:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #15
+; CHECK-NEXT:    extq z0.b, z0.b, z1.b, #6
 ; CHECK-NEXT:    ret
-  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 15)
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.extq.nxv8bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm, i32 3)
   ret <vscale x 8 x bfloat> %res
 }
 

From bf440f75b485e4ea7f809dc37df74cac140069fd Mon Sep 17 00:00:00 2001
From: Nikolas Klauser <nikolasklauser@berlin.de>
Date: Tue, 26 Nov 2024 18:00:30 +0100
Subject: [PATCH 30/41] [Clang][NFC] Remove trailing whitespace from
 Attr{,Docs}.td

---
 clang/include/clang/Basic/Attr.td     | 10 ++++-----
 clang/include/clang/Basic/AttrDocs.td | 32 +++++++++++++--------------
 2 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 14009826f2c550..6db36a015acfd7 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -1916,7 +1916,7 @@ public:
   }
   auto getArgIdents() const { return ArgIdents; }
   auto getArgLocs() const { return ArgLocs; }
-  void setParamIdx(size_t Idx, int Val) { 
+  void setParamIdx(size_t Idx, int Val) {
     assert(Idx < params_Size);
     params_[Idx] = Val;
   }
@@ -4637,7 +4637,7 @@ def HLSLResourceBinding: InheritableAttr {
   let AdditionalMembers = [{
   public:
       enum class RegisterType : unsigned { SRV, UAV, CBuffer, Sampler, C, I };
-  
+
   private:
       RegisterType RegType;
       unsigned SlotNumber;
@@ -4707,7 +4707,7 @@ def HLSLResource : InheritableAttr {
   let Spellings = [];
   let Subjects = SubjectList<[Struct]>;
   let LangOpts = [HLSL];
-  let Args = [    
+  let Args = [
     EnumArgument<
         "ResourceKind", "llvm::hlsl::ResourceKind",
         /*is_string=*/0,
@@ -4732,7 +4732,7 @@ def HLSLResource : InheritableAttr {
 
 def HLSLROV : TypeAttr {
   let Spellings = [CXX11<"hlsl", "is_rov">];
-  let LangOpts = [HLSL]; 
+  let LangOpts = [HLSL];
   let Documentation = [InternalOnly];
 }
 
@@ -4757,7 +4757,7 @@ def HLSLContainedType : TypeAttr {
 
 def HLSLRawBuffer : TypeAttr {
   let Spellings = [CXX11<"hlsl", "raw_buffer">];
-  let LangOpts = [HLSL]; 
+  let LangOpts = [HLSL];
   let Documentation = [InternalOnly];
 }
 
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index 6fb2eb3eb3e663..cbbfedeec46cee 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3943,7 +3943,7 @@ In this case, the capturing entity ``X`` could capture a dangling reference to t
 temporary object.
 
 .. code-block:: c++
-  
+
   void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set<std::string_view>& s) {
     s.insert(a);
   }
@@ -3957,8 +3957,8 @@ temporary object.
 
 The capturing entity ``X`` can be one of the following:
 
-- Another (named) function parameter. 
-  
+- Another (named) function parameter.
+
   .. code-block:: c++
 
     void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set<std::string_view>& s) {
@@ -3966,7 +3966,7 @@ The capturing entity ``X`` can be one of the following:
     }
 
 - ``this`` (in case of member functions).
-  
+
   .. code-block:: c++
 
     class S {
@@ -3977,7 +3977,7 @@ The capturing entity ``X`` can be one of the following:
     };
 
 - `global`, `unknown`.
-  
+
   .. code-block:: c++
 
     std::set<std::string_view> s;
@@ -4000,7 +4000,7 @@ function by writing the attribute after the function type:
 The attribute supports specifying more than one capturing entities:
 
 .. code-block:: c++
-  
+
   void addToSets(std::string_view a [[clang::lifetime_capture_by(s1, s2)]],
                  std::set<std::string_view>& s1,
                  std::set<std::string_view>& s2) {
@@ -4014,7 +4014,7 @@ statement-local and only detects use of a temporary as an argument to the
 annotated parameter.
 
 .. code-block:: c++
-  
+
   void addToSet(std::string_view a [[clang::lifetime_capture_by(s)]], std::set<std::string_view>& s);
   void use() {
     std::set<std::string_view> s;
@@ -7174,8 +7174,8 @@ the field it is attached to, and it may also lead to emission of automatic fix-i
 hints which would help the user replace the use of unsafe functions(/fields) with safe
 alternatives, though the attribute can be used even when the fix can't be automated.
 
-* Attribute attached to functions: The attribute does not suppress 
-  ``-Wunsafe-buffer-usage`` inside the function to which it is attached. 
+* Attribute attached to functions: The attribute does not suppress
+  ``-Wunsafe-buffer-usage`` inside the function to which it is attached.
   These warnings still need to be addressed.
 
   The attribute is warranted even if the only way a function can overflow
@@ -7238,10 +7238,10 @@ alternatives, though the attribute can be used even when the fix can't be automa
   and then use the attribute on the original ``baz()`` to help the users
   update their code to use the new function.
 
-* Attribute attached to fields: The attribute should only be attached to 
-  struct fields, if the fields can not be updated to a safe type with bounds 
-  check, such as std::span. In other words, the buffers prone to unsafe accesses 
-  should always be updated to use safe containers/views and attaching the attribute 
+* Attribute attached to fields: The attribute should only be attached to
+  struct fields, if the fields can not be updated to a safe type with bounds
+  check, such as std::span. In other words, the buffers prone to unsafe accesses
+  should always be updated to use safe containers/views and attaching the attribute
   must be last resort when such an update is infeasible.
 
   The attribute can be placed on individual fields or a set of them as shown below.
@@ -7259,7 +7259,7 @@ alternatives, though the attribute can be used even when the fix can't be automa
       size_t sz;
     };
 
-  Here, every read/write to the fields ptr1, ptr2, buf and sz will trigger a warning 
+  Here, every read/write to the fields ptr1, ptr2, buf and sz will trigger a warning
   that the field has been explcitly marked as unsafe due to unsafe-buffer operations.
 
   }];
@@ -7814,10 +7814,10 @@ def HLSLLoopHintDocs : Documentation {
   let Content = [{
 The ``[loop]`` directive allows loop optimization hints to be
 specified for the subsequent loop. The directive allows unrolling to
-be disabled and is not compatible with [unroll(x)]. 
+be disabled and is not compatible with [unroll(x)].
 
 Specifying the parameter, ``[loop]``, directs the
-unroller to not unroll the loop. 
+unroller to not unroll the loop.
 
 .. code-block:: hlsl
 

From b1a34b80b83156540519110cc798969dcfe1aec9 Mon Sep 17 00:00:00 2001
From: Zaara Syeda <syzaara@ca.ibm.com>
Date: Tue, 26 Nov 2024 12:09:49 -0500
Subject: [PATCH 31/41] [NFC][Test] Fix PowerPC test gcov_ctr_ref_init.ll
 (#117577)

---
 llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll
index 4969aec0a14942..fc7dacd873b07d 100644
--- a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll
+++ b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll
@@ -12,11 +12,16 @@ target triple = "powerpc-ibm-aix"
 ; CHECK-NEXT: L..__llvm_covinit_functions:
 ; CHECK-NEXT:     .vbyte  4, __llvm_gcov_writeout[DS]
 ; CHECK-NEXT:     .vbyte  4, __llvm_gcov_reset[DS]
-; The first .csect directive below is specifying the content of the csect.
-; The second .csect directive below is used to insert the .ref pseudo
-; instruction.
-; CHECK:    .csect __llvm_gcov_ctr_section[RW],3
 ; CHECK:    .csect __llvm_gcov_ctr_section[RW],3
+; CHECK-NEXT:    .lglobl __llvm_gcov_ctr                 # @_MergedGlobals
+; CHECK-NEXT:    .lglobl __llvm_gcov_ctr.1
+; CHECK-NEXT:    .align  3
+; CHECK-NEXT: L.._MergedGlobals:
+; CHECK-NEXT: __llvm_gcov_ctr:
+; CHECK-NEXT:     .space  8
+; CHECK-NEXT: __llvm_gcov_ctr.1:
+; CHECK-NEXT:     .space  8
+; CHECK:     .csect __llvm_gcov_ctr_section[RW],3
 ; CHECK-RW-NEXT:    .ref __llvm_covinit[RW]
 ; CHECK-RO-NEXT:    .ref __llvm_covinit[RO]
 

From c55a080c080ed76a9aabe6dcd1966fedc0ecda5a Mon Sep 17 00:00:00 2001
From: Philip Reames <preames@rivosinc.com>
Date: Tue, 26 Nov 2024 09:12:57 -0800
Subject: [PATCH 32/41] [RISCV] Add shuffle coverage for compress, decompress,
 and repeat idioms

compress is intented to match vcompress from the ISA manual. Note that
  deinterleave is a subset of this, and is already tested elsewhere.

decompress is the synthetic pattern defined in same - though we can often
  do better than the mentioned iota/vrgather.  Note that some of these
  can also be expressed as interleave with at least one undef source,
  and is already tested elsewhere.

repeat repeats each input element N times in the output.  It can be
  described as as a interleave operations, but we can sometimes do
  better lowering wise.
---
 .../RISCV/rvv/fixed-vectors-int-shuffles.ll   | 197 ++++++++++++++++++
 1 file changed, 197 insertions(+)

diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
index e46587f58b4eb6..dbfe7bb51dbffa 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll
@@ -744,3 +744,200 @@ define <8 x i8> @shuffle_v64i8_v8i8(<64 x i8> %wide.vec) {
   %s = shufflevector <64 x i8> %wide.vec, <64 x i8> poison, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
   ret <8 x i8> %s
 }
+
+define <8 x i8> @shuffle_compress_singlesrc_e8(<8 x i8> %v) {
+; CHECK-LABEL: shuffle_compress_singlesrc_e8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI49_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI49_0)
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vle8.v v10, (a0)
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
+  ret <8 x i8> %out
+}
+
+define <8 x i16> @shuffle_compress_singlesrc_e16(<8 x i16> %v) {
+; CHECK-LABEL: shuffle_compress_singlesrc_e16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI50_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI50_0)
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vle16.v v10, (a0)
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv.v.v v8, v9
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i16> %v, <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
+  ret <8 x i16> %out
+}
+
+define <8 x i32> @shuffle_compress_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_compress_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI51_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI51_0)
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 6, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %out
+}
+
+define <8 x i64> @shuffle_compress_singlesrc_e64(<8 x i64> %v) {
+; CHECK-LABEL: shuffle_compress_singlesrc_e64:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI52_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI52_0)
+; CHECK-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
+; CHECK-NEXT:    vle16.v v16, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v12, v8, v16
+; CHECK-NEXT:    vmv.v.v v8, v12
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i64> %v, <8 x i64> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
+  ret <8 x i64> %out
+}
+
+define <8 x i32> @shuffle_compress_singlesrc_gaps_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_compress_singlesrc_gaps_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI53_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI53_0)
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 4, i32 5, i32 7, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %out
+}
+
+define <8 x i32> @shuffle_decompress2_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_decompress2_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vwaddu.vv v10, v8, v8
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 2, i32 undef, i32 3, i32 undef>
+  ret <8 x i32> %out
+}
+
+define <8 x i32> @shuffle_decompress3_singlesrc_e32(<8 x i32> %v) {
+; RV32-LABEL: shuffle_decompress3_singlesrc_e32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    lui a0, %hi(.LCPI55_0)
+; RV32-NEXT:    addi a0, a0, %lo(.LCPI55_0)
+; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV32-NEXT:    vle16.v v12, (a0)
+; RV32-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV32-NEXT:    vmv.v.v v8, v10
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: shuffle_decompress3_singlesrc_e32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    lui a0, 32769
+; RV64-NEXT:    slli a0, a0, 21
+; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; RV64-NEXT:    vmv.v.x v12, a0
+; RV64-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; RV64-NEXT:    vrgatherei16.vv v10, v8, v12
+; RV64-NEXT:    vmv.v.v v8, v10
+; RV64-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 2, i32 undef>
+  ret <8 x i32> %out
+}
+
+; TODO: This should be a single vslideup.vi
+define <8 x i32> @shuffle_decompress4_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_decompress4_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsrl.vi v12, v10, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %out
+}
+
+; TODO: This should be either a single vslideup.vi or two widening interleaves.
+define <8 x i8> @shuffle_decompress4_singlesrc_e8(<8 x i8> %v) {
+; CHECK-LABEL: shuffle_decompress4_singlesrc_e8:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
+; CHECK-NEXT:    vid.v v9
+; CHECK-NEXT:    vsrl.vi v10, v9, 2
+; CHECK-NEXT:    vrgather.vv v9, v8, v10
+; CHECK-NEXT:    vmv1r.v v8, v9
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i8> %v, <8 x i8> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 1, i32 undef, i32 undef, i32 undef>
+  ret <8 x i8> %out
+}
+
+define <8 x i32> @shuffle_decompress_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_decompress_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    lui a0, %hi(.LCPI58_0)
+; CHECK-NEXT:    addi a0, a0, %lo(.LCPI58_0)
+; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
+; CHECK-NEXT:    vle16.v v12, (a0)
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 undef, i32 1, i32 undef, i32 3, i32 undef, i32 undef, i32 4>
+  ret <8 x i32> %out
+}
+
+define <8 x i32> @shuffle_repeat2_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_repeat2_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
+; CHECK-NEXT:    vwaddu.vv v10, v8, v8
+; CHECK-NEXT:    li a0, -1
+; CHECK-NEXT:    vwmaccu.vx v10, a0, v8
+; CHECK-NEXT:    vmv2r.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  ret <8 x i32> %out
+}
+
+define <8 x i32> @shuffle_repeat3_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_repeat3_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vmv.v.i v0, 7
+; CHECK-NEXT:    vmv.v.i v11, 1
+; CHECK-NEXT:    li a0, 192
+; CHECK-NEXT:    vmv.s.x v10, a0
+; CHECK-NEXT:    vmerge.vim v11, v11, 0, v0
+; CHECK-NEXT:    vmv.v.v v0, v10
+; CHECK-NEXT:    vmerge.vim v12, v11, 2, v0
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 2, i32 2>
+  ret <8 x i32> %out
+}
+
+define <8 x i32> @shuffle_repeat4_singlesrc_e32(<8 x i32> %v) {
+; CHECK-LABEL: shuffle_repeat4_singlesrc_e32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
+; CHECK-NEXT:    vid.v v10
+; CHECK-NEXT:    vsrl.vi v12, v10, 2
+; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
+; CHECK-NEXT:    vrgatherei16.vv v10, v8, v12
+; CHECK-NEXT:    vmv.v.v v8, v10
+; CHECK-NEXT:    ret
+  %out = shufflevector <8 x i32> %v, <8 x i32> poison, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %out
+}

From 5a3299a684d7d8c40f48d732e5b80a8bd29aa882 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 Nov 2024 12:59:15 -0500
Subject: [PATCH 33/41] AMDGPU: Remove some -verify-machineinstrs from tests
 (#117736)

We should leave these for EXPENSIVE_CHECKS builds. Some of these
were near the top of slowest tests.
---
 .../atomic_optimizations_local_pointer.ll     | 28 ++++++++---------
 llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll  | 30 +++++++++----------
 llvm/test/CodeGen/AMDGPU/flat-scratch.ll      | 20 ++++++-------
 llvm/test/CodeGen/AMDGPU/flat_atomics.ll      |  6 ++--
 .../CodeGen/AMDGPU/flat_atomics_i32_system.ll |  6 ++--
 llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll  |  6 ++--
 .../CodeGen/AMDGPU/flat_atomics_i64_system.ll |  6 ++--
 .../flat_atomics_i64_system_noprivate.ll      |  6 ++--
 llvm/test/CodeGen/AMDGPU/fmaximum.ll          |  4 +--
 llvm/test/CodeGen/AMDGPU/fmaxnum.ll           |  4 +--
 llvm/test/CodeGen/AMDGPU/fminimum.ll          |  4 +--
 llvm/test/CodeGen/AMDGPU/fminnum.ll           |  4 +--
 llvm/test/CodeGen/AMDGPU/fmul.ll              |  4 +--
 llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll |  4 +--
 llvm/test/CodeGen/AMDGPU/global-alias.ll      |  2 +-
 .../CodeGen/AMDGPU/global-smrd-unknown.ll     |  2 +-
 .../AMDGPU/global_atomics_scan_fadd.ll        | 24 +++++++--------
 llvm/test/CodeGen/AMDGPU/idot2.ll             | 12 ++++----
 llvm/test/CodeGen/AMDGPU/idot4s.ll            | 14 ++++-----
 llvm/test/CodeGen/AMDGPU/idot4u.ll            | 14 ++++-----
 llvm/test/CodeGen/AMDGPU/idot8s.ll            | 16 +++++-----
 llvm/test/CodeGen/AMDGPU/idot8u.ll            | 12 ++++----
 .../CodeGen/AMDGPU/insert_vector_dynelt.ll    |  2 +-
 llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll |  4 +--
 llvm/test/CodeGen/AMDGPU/load-global-f32.ll   |  6 ++--
 llvm/test/CodeGen/AMDGPU/load-global-f64.ll   |  6 ++--
 llvm/test/CodeGen/AMDGPU/load-global-i1.ll    |  4 +--
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   | 10 +++----
 llvm/test/CodeGen/AMDGPU/load-global-i32.ll   | 10 +++----
 llvm/test/CodeGen/AMDGPU/load-global-i64.ll   |  6 ++--
 llvm/test/CodeGen/AMDGPU/load-global-i8.ll    | 10 +++----
 31 files changed, 143 insertions(+), 143 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 9c2527ae4781bd..45b161d7959f4f 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -1,18 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
-; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
+; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132,GFX1132_DPP %s
 
 declare i32 @llvm.amdgcn.workitem.id.x()
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 02eb1ad9453291..87d63f4f9cd902 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefix=CI -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=kaveri | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=fiji -mattr=-xnack | FileCheck -check-prefix=VI-NOXNACK -check-prefix=GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack | FileCheck -check-prefix=VI-XNACK  -check-prefix=GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack -verify-machineinstrs | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
 
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch -verify-machineinstrs | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack -verify-machineinstrs | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
 
 ; GCN-LABEL: {{^}}no_vcc_no_flat:
 
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
index 8290942e46e6a1..97d642b991f705 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll
@@ -1,14 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX10 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX12 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefix=GFX9-PAL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940 %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-PAL %s
-; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-PAL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX10 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx900 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefix=GFX9-PAL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -mattr=-promote-alloca < %s | FileCheck -check-prefixes=GFX940 %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1010 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1010-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1030 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX10-PAL,GFX1030-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX11-PAL %s
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1200 -mattr=-promote-alloca -mattr=+enable-flat-scratch < %s | FileCheck --check-prefixes=GFX12-PAL %s
 
 define amdgpu_kernel void @zero_init_kernel() {
 ; GFX9-LABEL: zero_init_kernel:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
index 6727ebd10b92d5..e674b57aae3efc 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
 
 define amdgpu_kernel void @atomic_add_i32_offset(ptr %out, i32 %in) {
 ; GCN1-LABEL: atomic_add_i32_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 9b11929de2c910..1311560715ddd7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefixes=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefixes=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
 
 ; ---------------------------------------------------------------------
 ; atomicrmw xchg
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index f5433ca4da4cd4..b4d7ff8e7c526e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
 
 define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
 ; GCN1-LABEL: atomic_add_i64_offset:
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
index 2989f08ac56e7b..36bddb7ac2fd68 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN1 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN2 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN3 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN3 %s
 
 ; ---------------------------------------------------------------------
 ; atomicrmw xchg
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
index 604fc732e7e1cd..fe47461ebf9569 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
 
 ; ---------------------------------------------------------------------
 ; atomicrmw xchg
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum.ll b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
index 11b7c51ef2d516..fe8150b3c21c4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
 
 define amdgpu_ps float @test_fmaximum_f32_vv(float %a, float %b) {
 ; GCN-LABEL: test_fmaximum_f32_vv:
diff --git a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
index 38640a18b5aee6..09483194f08892 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_fmax_f32_ieee_mode_on:
 ; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum.ll b/llvm/test/CodeGen/AMDGPU/fminimum.ll
index 3bd82eca8ce955..ba536aade8c49c 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-SDAG %s
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX12-GISEL %s
 
 define amdgpu_ps float @test_fminimum_f32_vv(float %a, float %b) {
 ; GCN-LABEL: test_fminimum_f32_vv:
diff --git a/llvm/test/CodeGen/AMDGPU/fminnum.ll b/llvm/test/CodeGen/AMDGPU/fminnum.ll
index 65b311845a6b77..536c67561a0539 100644
--- a/llvm/test/CodeGen/AMDGPU/fminnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminnum.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}test_fmin_f32_ieee_mode_on:
 ; GCN: v_mul_f32_e64 [[QUIET0:v[0-9]+]], 1.0, s{{[0-9]+}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmul.ll b/llvm/test/CodeGen/AMDGPU/fmul.ll
index cedf7c43ff7cf8..63dc674acd9e14 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}fmul_f32:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
index ab380dbef107ad..d8ea0ddf77b7a1 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx950.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 -verify-machineinstrs | FileCheck %s -check-prefix=GFX950-SDAG
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 -verify-machineinstrs | FileCheck %s -check-prefix=GFX950-GISEL
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=0 | FileCheck %s -check-prefix=GFX950-SDAG
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx950 -global-isel=1 | FileCheck %s -check-prefix=GFX950-GISEL
 
 declare <2 x bfloat> @llvm.amdgcn.struct.buffer.atomic.fadd.v2bf16(<2 x bfloat>, <4 x i32>, i32, i32, i32, i32 immarg)
 declare <2 x bfloat> @llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16(<2 x bfloat> %val, <4 x i32> %rsrc, i32, i32, i32)
diff --git a/llvm/test/CodeGen/AMDGPU/global-alias.ll b/llvm/test/CodeGen/AMDGPU/global-alias.ll
index 5c1c4977cabb0e..334e6e2b617e00 100644
--- a/llvm/test/CodeGen/AMDGPU/global-alias.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-alias.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=amdgcn %s -o - | FileCheck %s
 
 @foo_a = alias void (ptr), ptr @foo
 @bar_a = alias void (ptr), ptr @foo_a
diff --git a/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll b/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
index b38758bae537d8..f7437de216cc2b 100644
--- a/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-smrd-unknown.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji  -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji  -memdep-block-scan-limit=1 -amdgpu-scalarize-global-loads < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}unknown_memdep_analysis:
 ; GCN: flat_load_dword
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 30ae461d5de5ad..fbe06b3651b06c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,16 +1,16 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
-; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
+; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
 
 declare float @div.float.value()
 declare double @div.double.value()
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll
index 013eb8f59bbb9c..b443e654350c5e 100644
--- a/llvm/test/CodeGen/AMDGPU/idot2.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot2.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-NODL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX9-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck --check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck --check-prefixes=GFX9-NODL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck --check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck --check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck --check-prefixes=GFX10-DL %s
 
 ; add(mul(S0.x, S1.y),
 ;     add (mul (S0.y, S1.y), S3)) -> v_dot2_{I|U}32_{I|U}16(S1, S2, S3)
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 1b181643b3d469..dd29970af52fde 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @idot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot4_acc32:
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index 23e19bbe97153b..8f82348d350e0a 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-NODL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-NODL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11-DL %s
 
 define amdgpu_kernel void @udot4_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot4_acc32:
diff --git a/llvm/test/CodeGen/AMDGPU/idot8s.ll b/llvm/test/CodeGen/AMDGPU/idot8s.ll
index 25aa623295fe16..add62a5c39cb14 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -1,12 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL-XNACK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1031 < %s | FileCheck -check-prefixes=GFX10-DL-NOXNACK %s
 
 define amdgpu_kernel void @idot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: idot8_acc32:
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index d8491f322e69a0..069bebdf3c469d 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx803 < %s | FileCheck -check-prefixes=GFX8 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefixes=GFX9-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 < %s | FileCheck -check-prefixes=GFX10-DL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 < %s | FileCheck -check-prefixes=GFX10-DL %s
 
 define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
 ; GFX7-LABEL: udot8_acc32:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
index 6bafaad582901b..7912d1cf8dc0d1 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN %s
 
 define amdgpu_kernel void @float4_inselt(ptr addrspace(1) %out, <4 x float> %vec, i32 %sel) {
 ; GCN-LABEL: float4_inselt:
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 3aa0437f0466e3..72cda5c718f5b2 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
-; RUN: llc -verify-machineinstrs -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri -mattr=-flat-for-global,+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
index 7b1355425729e9..ca24b78f62c2e5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -1,6 +1,6 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,SI-NOHSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC,GCNX3-HSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC,GCNX3-NOHSA %s
 
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=R600,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=R600,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
index ed3618dfd64745..31998b466919c5 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -1,6 +1,6 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-HSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn-amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_f64:
 ; GCN-NOHSA: buffer_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
index dac928d70c6502..92add00f84b40e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s
 ; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}global_load_i1:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 7411712da31bd8..64f1f45bf734cf 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -1,9 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-SI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-HSA %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=GCN-NOHSA-VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=EG %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap --check-prefix=CM %s
 
 ; FIXME: r600 is broken because the bigger testcases spill and it's not implemented
 
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 629343b47bc16f..8f6a1f8c01ec34 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI-NOHSA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-HSA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn < %s | FileCheck -check-prefix=SI-NOHSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCNX3-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNX3-NOHSA %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
-; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX900-HSA %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN-HSA -check-prefix=GCN-GFX908-HSA %s
 
 define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
 ; SI-NOHSA-LABEL: global_load_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
index 61ccc17d59eb00..00a2435e39207a 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -1,6 +1,6 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-HSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck --check-prefixes=GCN-HSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck --check-prefixes=GCN-NOHSA,FUNC %s
 
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck --check-prefixes=EG,FUNC %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman < %s | FileCheck --check-prefixes=EG,FUNC %s
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
index add5f13bd2d996..fb34b5e1f3af6e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -1,8 +1,8 @@
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
-; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn--amdhsa -mcpu=kaveri < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-HSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -allow-deprecated-dag-overlap -check-prefixes=GCN,GCN-NOHSA,FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=redwood < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=r600 -mcpu=cayman < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=EG -check-prefix=FUNC %s
 
 
 ; FUNC-LABEL: {{^}}global_load_i8:

From 4ab298b5fbc8f48387062b2dd99ea07127c02e6b Mon Sep 17 00:00:00 2001
From: Jacob Lalonde <jalalonde@fb.com>
Date: Tue, 26 Nov 2024 10:20:52 -0800
Subject: [PATCH 34/41] [LLDB][ThreadELFCore] Set all the properties of
 ELFLinuxSigInfo to a non build dependent size (#117604)

On #110065 the changes to LinuxSigInfo Struct introduced some variables
that will differ in size on 32b or 64b. I've rectified this by setting
them all to build independent types.
---
 lldb/source/Plugins/Process/elf-core/ThreadElfCore.h | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/lldb/source/Plugins/Process/elf-core/ThreadElfCore.h b/lldb/source/Plugins/Process/elf-core/ThreadElfCore.h
index 4ebbaadebe9f90..6f8d41351a6bfb 100644
--- a/lldb/source/Plugins/Process/elf-core/ThreadElfCore.h
+++ b/lldb/source/Plugins/Process/elf-core/ThreadElfCore.h
@@ -83,10 +83,10 @@ struct ELFLinuxSigInfo {
   int32_t si_errno;
   int32_t si_code;
   // Copied from siginfo_t so we don't have to include signal.h on non 'Nix
-  // builds.
-  struct {
-    lldb::addr_t si_addr;  /* faulting insn/memory ref. */
-    short int si_addr_lsb; /* Valid LSB of the reported address.  */
+  // builds. Slight modifications to ensure no 32b vs 64b differences.
+  struct alignas(8) {
+    lldb::addr_t si_addr; /* faulting insn/memory ref. */
+    int16_t si_addr_lsb;  /* Valid LSB of the reported address.  */
     union {
       /* used when si_code=SEGV_BNDERR */
       struct {
@@ -98,7 +98,8 @@ struct ELFLinuxSigInfo {
     } bounds;
   } sigfault;
 
-  enum { eUnspecified, eNT_SIGINFO } note_type;
+  enum SigInfoNoteType : uint8_t { eUnspecified, eNT_SIGINFO };
+  SigInfoNoteType note_type;
 
   ELFLinuxSigInfo();
 

From 5fd4f32f985f83414d82a1c2c55741e363693352 Mon Sep 17 00:00:00 2001
From: Zhengxing li <zhengxingli@microsoft.com>
Date: Tue, 26 Nov 2024 10:45:31 -0800
Subject: [PATCH 35/41] [HLSL] Implement SV_GroupID semantic (#115911)

Support SV_GroupID attribute.
Translate it into dx.group.id in clang codeGen.

Fixes: #70120
---
 clang/include/clang/Basic/Attr.td             |  7 ++++
 clang/include/clang/Basic/AttrDocs.td         | 10 ++++++
 clang/include/clang/Sema/SemaHLSL.h           |  4 +++
 clang/lib/CodeGen/CGHLSLRuntime.cpp           |  4 +++
 clang/lib/Parse/ParseHLSL.cpp                 |  1 +
 clang/lib/Sema/SemaDeclAttr.cpp               |  3 ++
 clang/lib/Sema/SemaHLSL.cpp                   | 27 +++++++++++-----
 .../CodeGenHLSL/semantics/SV_GroupID.hlsl     | 32 +++++++++++++++++++
 .../SemaHLSL/Semantics/entry_parameter.hlsl   | 11 ++++---
 .../Semantics/invalid_entry_parameter.hlsl    | 22 +++++++++++++
 .../Semantics/valid_entry_parameter.hlsl      | 25 +++++++++++++++
 11 files changed, 134 insertions(+), 12 deletions(-)
 create mode 100644 clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index 6db36a015acfd7..b055cbd769bb50 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4621,6 +4621,13 @@ def HLSLNumThreads: InheritableAttr {
   let Documentation = [NumThreadsDocs];
 }
 
+def HLSLSV_GroupID: HLSLAnnotationAttr {
+  let Spellings = [HLSLAnnotation<"SV_GroupID">];
+  let Subjects = SubjectList<[ParmVar, Field]>;
+  let LangOpts = [HLSL];
+  let Documentation = [HLSLSV_GroupIDDocs];
+}
+
 def HLSLSV_GroupIndex: HLSLAnnotationAttr {
   let Spellings = [HLSLAnnotation<"SV_GroupIndex">];
   let Subjects = SubjectList<[ParmVar, GlobalVar]>;
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
index cbbfedeec46cee..aafd4449e47004 100644
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -7934,6 +7934,16 @@ randomized.
   }];
 }
 
+def HLSLSV_GroupIDDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``SV_GroupID`` semantic, when applied to an input parameter, specifies which
+thread group a shader is executing in. This attribute is only supported in compute shaders.
+
+The full documentation is available here: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/sv-groupid
+  }];
+}
+
 def HLSLSV_GroupIndexDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index 06c541dec08cc8..ee685d95c96154 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -119,6 +119,7 @@ class SemaHLSL : public SemaBase {
   void handleNumThreadsAttr(Decl *D, const ParsedAttr &AL);
   void handleWaveSizeAttr(Decl *D, const ParsedAttr &AL);
   void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL);
+  void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL);
   void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL);
   void handleShaderAttr(Decl *D, const ParsedAttr &AL);
   void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL);
@@ -136,6 +137,9 @@ class SemaHLSL : public SemaBase {
 
   bool CheckCompatibleParameterABI(FunctionDecl *New, FunctionDecl *Old);
 
+  // Diagnose whether the input ID is uint/unit2/uint3 type.
+  bool diagnoseInputIDType(QualType T, const ParsedAttr &AL);
+
   ExprResult ActOnOutParamExpr(ParmVarDecl *Param, Expr *Arg);
 
   QualType getInoutParameterType(QualType Ty);
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 7ba0d615018181..2c293523fca8ca 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -389,6 +389,10 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B,
         CGM.getIntrinsic(getThreadIdIntrinsic());
     return buildVectorInput(B, ThreadIDIntrinsic, Ty);
   }
+  if (D.hasAttr<HLSLSV_GroupIDAttr>()) {
+    llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(Intrinsic::dx_group_id);
+    return buildVectorInput(B, GroupIDIntrinsic, Ty);
+  }
   assert(false && "Unhandled parameter attribute");
   return nullptr;
 }
diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp
index 46a37e94353533..4de342b63ed802 100644
--- a/clang/lib/Parse/ParseHLSL.cpp
+++ b/clang/lib/Parse/ParseHLSL.cpp
@@ -280,6 +280,7 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs,
   case ParsedAttr::UnknownAttribute:
     Diag(Loc, diag::err_unknown_hlsl_semantic) << II;
     return;
+  case ParsedAttr::AT_HLSLSV_GroupID:
   case ParsedAttr::AT_HLSLSV_GroupIndex:
   case ParsedAttr::AT_HLSLSV_DispatchThreadID:
     break;
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
index 146d9c86e0715a..53cc8cb6afd7dc 100644
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -7103,6 +7103,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
   case ParsedAttr::AT_HLSLWaveSize:
     S.HLSL().handleWaveSizeAttr(D, AL);
     break;
+  case ParsedAttr::AT_HLSLSV_GroupID:
+    S.HLSL().handleSV_GroupIDAttr(D, AL);
+    break;
   case ParsedAttr::AT_HLSLSV_GroupIndex:
     handleSimpleAttribute<HLSLSV_GroupIndexAttr>(S, D, AL);
     break;
diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp
index 8109c3a2cc0f1b..8b2f24a8e4be0a 100644
--- a/clang/lib/Sema/SemaHLSL.cpp
+++ b/clang/lib/Sema/SemaHLSL.cpp
@@ -434,6 +434,7 @@ void SemaHLSL::CheckSemanticAnnotation(
   switch (AnnotationAttr->getKind()) {
   case attr::HLSLSV_DispatchThreadID:
   case attr::HLSLSV_GroupIndex:
+  case attr::HLSLSV_GroupID:
     if (ST == llvm::Triple::Compute)
       return;
     DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute});
@@ -764,26 +765,36 @@ void SemaHLSL::handleWaveSizeAttr(Decl *D, const ParsedAttr &AL) {
     D->addAttr(NewAttr);
 }
 
-static bool isLegalTypeForHLSLSV_DispatchThreadID(QualType T) {
-  if (!T->hasUnsignedIntegerRepresentation())
+bool SemaHLSL::diagnoseInputIDType(QualType T, const ParsedAttr &AL) {
+  const auto *VT = T->getAs<VectorType>();
+
+  if (!T->hasUnsignedIntegerRepresentation() ||
+      (VT && VT->getNumElements() > 3)) {
+    Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type)
+        << AL << "uint/uint2/uint3";
     return false;
-  if (const auto *VT = T->getAs<VectorType>())
-    return VT->getNumElements() <= 3;
+  }
+
   return true;
 }
 
 void SemaHLSL::handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL) {
   auto *VD = cast<ValueDecl>(D);
-  if (!isLegalTypeForHLSLSV_DispatchThreadID(VD->getType())) {
-    Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type)
-        << AL << "uint/uint2/uint3";
+  if (!diagnoseInputIDType(VD->getType(), AL))
     return;
-  }
 
   D->addAttr(::new (getASTContext())
                  HLSLSV_DispatchThreadIDAttr(getASTContext(), AL));
 }
 
+void SemaHLSL::handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL) {
+  auto *VD = cast<ValueDecl>(D);
+  if (!diagnoseInputIDType(VD->getType(), AL))
+    return;
+
+  D->addAttr(::new (getASTContext()) HLSLSV_GroupIDAttr(getASTContext(), AL));
+}
+
 void SemaHLSL::handlePackOffsetAttr(Decl *D, const ParsedAttr &AL) {
   if (!isa<VarDecl>(D) || !isa<HLSLBufferDecl>(D->getDeclContext())) {
     Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_ast_node)
diff --git a/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl
new file mode 100644
index 00000000000000..5e09f0fe06d4e6
--- /dev/null
+++ b/clang/test/CodeGenHLSL/semantics/SV_GroupID.hlsl
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s
+
+// Make sure SV_GroupID translated into dx.group.id.
+
+// CHECK:  define void @foo()
+// CHECK:  %[[#ID:]] = call i32 @llvm.dx.group.id(i32 0)
+// CHECK:  call void @{{.*}}foo{{.*}}(i32 %[[#ID]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void foo(uint Idx : SV_GroupID) {}
+
+// CHECK:  define void @bar()
+// CHECK:  %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0)
+// CHECK:  %[[#ID_X_:]] = insertelement <2 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK:  %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1)
+// CHECK:  %[[#ID_XY:]] = insertelement <2 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK:  call void @{{.*}}bar{{.*}}(<2 x i32> %[[#ID_XY]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void bar(uint2 Idx : SV_GroupID) {}
+
+// CHECK:  define void @test()
+// CHECK:  %[[#ID_X:]] = call i32 @llvm.dx.group.id(i32 0)
+// CHECK:  %[[#ID_X_:]] = insertelement <3 x i32> poison, i32 %[[#ID_X]], i64 0
+// CHECK:  %[[#ID_Y:]] = call i32 @llvm.dx.group.id(i32 1)
+// CHECK:  %[[#ID_XY:]] = insertelement <3 x i32> %[[#ID_X_]], i32 %[[#ID_Y]], i64 1
+// CHECK:  %[[#ID_Z:]] = call i32 @llvm.dx.group.id(i32 2)
+// CHECK:  %[[#ID_XYZ:]] = insertelement <3 x i32> %[[#ID_XY]], i32 %[[#ID_Z]], i64 2
+// CHECK:  call void @{{.*}}test{{.*}}(<3 x i32> %[[#ID_XYZ]])
+[shader("compute")]
+[numthreads(8,8,1)]
+void test(uint3 Idx : SV_GroupID) {}
diff --git a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl
index 8484259f84692b..13c07038d2e4a4 100644
--- a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl
+++ b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl
@@ -2,12 +2,15 @@
 // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-mesh -hlsl-entry CSMain -x hlsl -finclude-default-header  -verify -o - %s
 
 [numthreads(8,8,1)]
-// expected-error@+2 {{attribute 'SV_GroupIndex' is unsupported in 'mesh' shaders, requires compute}}
-// expected-error@+1 {{attribute 'SV_DispatchThreadID' is unsupported in 'mesh' shaders, requires compute}}
-void CSMain(int GI : SV_GroupIndex, uint ID : SV_DispatchThreadID) {
-// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain 'void (int, uint)'
+// expected-error@+3 {{attribute 'SV_GroupIndex' is unsupported in 'mesh' shaders, requires compute}}
+// expected-error@+2 {{attribute 'SV_DispatchThreadID' is unsupported in 'mesh' shaders, requires compute}}
+// expected-error@+1 {{attribute 'SV_GroupID' is unsupported in 'mesh' shaders, requires compute}}
+void CSMain(int GI : SV_GroupIndex, uint ID : SV_DispatchThreadID, uint GID : SV_GroupID) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain 'void (int, uint, uint)'
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:17 GI 'int'
 // CHECK-NEXT: HLSLSV_GroupIndexAttr
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:42 ID 'uint'
 // CHECK-NEXT: HLSLSV_DispatchThreadIDAttr
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:73 GID 'uint'
+// CHECK-NEXT: HLSLSV_GroupIDAttr
 }
diff --git a/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl
index bc3cf8bc51daf4..4e1f88aa2294b5 100644
--- a/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl
+++ b/clang/test/SemaHLSL/Semantics/invalid_entry_parameter.hlsl
@@ -27,3 +27,25 @@ struct ST2 {
     static uint X : SV_DispatchThreadID;
     uint s : SV_DispatchThreadID;
 };
+
+[numthreads(8,8,1)]
+// expected-error@+1 {{attribute 'SV_GroupID' only applies to a field or parameter of type 'uint/uint2/uint3'}}
+void CSMain_GID(float ID : SV_GroupID) {
+}
+
+[numthreads(8,8,1)]
+// expected-error@+1 {{attribute 'SV_GroupID' only applies to a field or parameter of type 'uint/uint2/uint3'}}
+void CSMain2_GID(ST GID : SV_GroupID) {
+
+}
+
+void foo_GID() {
+// expected-warning@+1 {{'SV_GroupID' attribute only applies to parameters and non-static data members}}
+  uint GIS : SV_GroupID;
+}
+
+struct ST2_GID {
+// expected-warning@+1 {{'SV_GroupID' attribute only applies to parameters and non-static data members}}
+    static uint GID : SV_GroupID;
+    uint s_gid : SV_GroupID;
+};
diff --git a/clang/test/SemaHLSL/Semantics/valid_entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/valid_entry_parameter.hlsl
index 8e79fc4d85ec91..10a5e5dabac87b 100644
--- a/clang/test/SemaHLSL/Semantics/valid_entry_parameter.hlsl
+++ b/clang/test/SemaHLSL/Semantics/valid_entry_parameter.hlsl
@@ -24,3 +24,28 @@ void CSMain3(uint3 : SV_DispatchThreadID) {
 // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 'uint3'
 // CHECK-NEXT: HLSLSV_DispatchThreadIDAttr
 }
+
+[numthreads(8,8,1)]
+void CSMain_GID(uint ID : SV_GroupID) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain_GID 'void (uint)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:22 ID 'uint'
+// CHECK-NEXT: HLSLSV_GroupIDAttr
+}
+[numthreads(8,8,1)]
+void CSMain1_GID(uint2 ID : SV_GroupID) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain1_GID 'void (uint2)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:24 ID 'uint2'
+// CHECK-NEXT: HLSLSV_GroupIDAttr
+}
+[numthreads(8,8,1)]
+void CSMain2_GID(uint3 ID : SV_GroupID) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain2_GID 'void (uint3)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:24 ID 'uint3'
+// CHECK-NEXT: HLSLSV_GroupIDAttr
+}
+[numthreads(8,8,1)]
+void CSMain3_GID(uint3 : SV_GroupID) {
+// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:6 CSMain3_GID 'void (uint3)'
+// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:24 'uint3'
+// CHECK-NEXT: HLSLSV_GroupIDAttr
+}

From 78c7024640a5b511685c445f554b7d985a7cf286 Mon Sep 17 00:00:00 2001
From: erichkeane <ekeane@nvidia.com>
Date: Tue, 26 Nov 2024 09:48:27 -0800
Subject: [PATCH 36/41] [OpenACC] Implement 'present' for combined constructs.

This is another clause where the parsing does all the required
enforcement besides the construct it appertains to, so this patch
removes the restriction and adds sufficient test coverage for combined
constructs.
---
 clang/lib/Sema/SemaOpenACC.cpp                |   9 +-
 .../ast-print-openacc-combined-construct.cpp  |   9 ++
 ...d-construct-auto_seq_independent-clauses.c |  12 --
 .../combined-construct-default-clause.c       |   3 +-
 .../combined-construct-present-ast.cpp        |  78 ++++++++++++
 .../combined-construct-present-clause.c       |  58 +++++++++
 .../combined-construct-present-clause.cpp     | 112 ++++++++++++++++++
 7 files changed, 263 insertions(+), 18 deletions(-)
 create mode 100644 clang/test/SemaOpenACC/combined-construct-present-ast.cpp
 create mode 100644 clang/test/SemaOpenACC/combined-construct-present-clause.c
 create mode 100644 clang/test/SemaOpenACC/combined-construct-present-clause.cpp

diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp
index 76be9a2abf5e46..d146edeabab741 100644
--- a/clang/lib/Sema/SemaOpenACC.cpp
+++ b/clang/lib/Sema/SemaOpenACC.cpp
@@ -859,10 +859,11 @@ OpenACCClause *SemaOpenACCClauseVisitor::VisitNoCreateClause(
 
 OpenACCClause *SemaOpenACCClauseVisitor::VisitPresentClause(
     SemaOpenACC::OpenACCParsedClause &Clause) {
-  // Restrictions only properly implemented on 'compute' constructs, and
-  // 'compute' constructs are the only construct that can do anything with
-  // this yet, so skip/treat as unimplemented in this case.
-  if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()))
+  // Restrictions only properly implemented on 'compute'/'combined constructs,
+  // and 'compute'/'combined' constructs are the only construct that can do
+  // anything with this yet, so skip/treat as unimplemented in this case.
+  if (!isOpenACCComputeDirectiveKind(Clause.getDirectiveKind()) &&
+      !isOpenACCCombinedDirectiveKind(Clause.getDirectiveKind()))
     return isNotImplemented();
   // ActOnVar ensured that everything is a valid variable reference, so there
   // really isn't anything to do here. GCC does some duplicate-finding, though
diff --git a/clang/test/AST/ast-print-openacc-combined-construct.cpp b/clang/test/AST/ast-print-openacc-combined-construct.cpp
index 6885806584f3d1..e04c39ac9bc5be 100644
--- a/clang/test/AST/ast-print-openacc-combined-construct.cpp
+++ b/clang/test/AST/ast-print-openacc-combined-construct.cpp
@@ -122,4 +122,13 @@ void foo() {
 #pragma acc kernels loop async
   for(int i = 0;i<5;++i);
 
+// CHECK: #pragma acc parallel loop present(i, array[1], array, array[1:2])
+#pragma acc parallel loop present(i, array[1], array, array[1:2])
+  for(int i = 0;i<5;++i);
+// CHECK: #pragma acc serial loop present(i, array[1], array, array[1:2])
+#pragma acc serial loop present(i, array[1], array, array[1:2])
+  for(int i = 0;i<5;++i);
+// CHECK: #pragma acc kernels loop present(i, array[1], array, array[1:2])
+#pragma acc kernels loop present(i, array[1], array, array[1:2])
+  for(int i = 0;i<5;++i);
 }
diff --git a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
index a770020764d356..69f93a6c605156 100644
--- a/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
+++ b/clang/test/SemaOpenACC/combined-construct-auto_seq_independent-clauses.c
@@ -111,8 +111,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop auto no_create(Var)
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop auto present(Var)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop auto private(Var)
@@ -274,8 +272,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop no_create(Var) auto
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop present(Var) auto
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop private(Var) auto
@@ -438,8 +434,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop independent no_create(Var)
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop independent present(Var)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop independent private(Var)
@@ -601,8 +595,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop no_create(Var) independent
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop present(Var) independent
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop private(Var) independent
@@ -771,8 +763,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop seq no_create(Var)
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop seq present(Var)
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop seq private(Var)
@@ -940,8 +930,6 @@ void uses() {
   // expected-warning@+1{{OpenACC clause 'no_create' not yet implemented}}
 #pragma acc parallel loop no_create(Var) seq
   for(unsigned i = 0; i < 5; ++i);
-  // TODOexpected-error@+1{{OpenACC 'present' clause is not valid on 'parallel loop' directive}}
-  // expected-warning@+1{{OpenACC clause 'present' not yet implemented}}
 #pragma acc parallel loop present(Var) seq
   for(unsigned i = 0; i < 5; ++i);
 #pragma acc parallel loop private(Var) seq
diff --git a/clang/test/SemaOpenACC/combined-construct-default-clause.c b/clang/test/SemaOpenACC/combined-construct-default-clause.c
index 646942ec700133..c420c8ebeb31d6 100644
--- a/clang/test/SemaOpenACC/combined-construct-default-clause.c
+++ b/clang/test/SemaOpenACC/combined-construct-default-clause.c
@@ -11,8 +11,7 @@ void SingleOnly() {
 
   int i;
 
-  // expected-warning@+4{{OpenACC clause 'copy' not yet implemented}}
-  // expected-warning@+3{{OpenACC clause 'present' not yet implemented}}
+  // expected-warning@+3{{OpenACC clause 'copy' not yet implemented}}
   // expected-error@+2{{OpenACC 'default' clause cannot appear more than once on a 'kernels loop' directive}}
   // expected-note@+1{{previous clause is here}}
   #pragma acc kernels loop self default(present) present(i) default(none) copy(i)
diff --git a/clang/test/SemaOpenACC/combined-construct-present-ast.cpp b/clang/test/SemaOpenACC/combined-construct-present-ast.cpp
new file mode 100644
index 00000000000000..028831c5f9899c
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-present-ast.cpp
@@ -0,0 +1,78 @@
+// RUN: %clang_cc1 %s -fopenacc -ast-dump | FileCheck %s
+
+// Test this with PCH.
+// RUN: %clang_cc1 %s -fopenacc -emit-pch -o %t %s
+// RUN: %clang_cc1 %s -fopenacc -include-pch %t -ast-dump-all | FileCheck %s
+
+#ifndef PCH_HELPER
+#define PCH_HELPER
+
+int Global;
+short GlobalArray[5];
+
+void NormalUses(float *PointerParam) {
+  // CHECK: FunctionDecl{{.*}}NormalUses
+  // CHECK: ParmVarDecl
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc parallel loop present(GlobalArray, PointerParam[Global])
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} parallel loop
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'short[5]' lvalue Var{{.*}}'GlobalArray' 'short[5]'
+  // CHECK-NEXT: ArraySubscriptExpr{{.*}}'float' lvalue
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'float *' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'float *' lvalue ParmVar{{.*}}'PointerParam' 'float *'
+  // CHECK-NEXT: ImplicitCastExpr{{.*}} 'int' <LValueToRValue>
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue Var{{.*}}'Global' 'int'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+}
+
+template<auto &NTTP, typename T>
+void TemplUses(T t) {
+  // CHECK-NEXT: FunctionTemplateDecl
+  // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}}referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: TemplateTypeParmDecl{{.*}}typename depth 0 index 1 T
+  // CHECK-NEXT: FunctionDecl{{.*}} TemplUses 'void (T)'
+  // CHECK-NEXT: ParmVarDecl{{.*}} referenced t 'T'
+  // CHECK-NEXT: CompoundStmt
+
+#pragma acc serial loop seq present(NTTP, t)
+  for(int i = 0; i < 5; ++i);
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} serial loop
+  // CHECK-NEXT: seq clause
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: DeclRefExpr{{.*}}'auto' lvalue NonTypeTemplateParm{{.*}} 'NTTP' 'auto &'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'T' lvalue ParmVar{{.*}} 't' 'T'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+
+  // Check the instantiated versions of the above.
+  // CHECK-NEXT: FunctionDecl{{.*}} used TemplUses 'void (int)' implicit_instantiation
+  // CHECK-NEXT: TemplateArgument decl
+  // CHECK-NEXT: Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: TemplateArgument type 'int'
+  // CHECK-NEXT: BuiltinType{{.*}} 'int'
+  // CHECK-NEXT: ParmVarDecl{{.*}} used t 'int'
+  // CHECK-NEXT: CompoundStmt
+
+// #pragma acc parallel seq present(NTTP, t)
+  // CHECK-NEXT: OpenACCCombinedConstruct{{.*}} serial loop
+  // CHECK-NEXT: seq clause
+  // CHECK-NEXT: present clause
+  // CHECK-NEXT: SubstNonTypeTemplateParmExpr{{.*}}'const unsigned int' lvalue
+  // CHECK-NEXT: NonTypeTemplateParmDecl{{.*}} referenced 'auto &' depth 0 index 0 NTTP
+  // CHECK-NEXT: DeclRefExpr{{.*}}'const unsigned int' lvalue Var{{.*}} 'CEVar' 'const unsigned int'
+  // CHECK-NEXT: DeclRefExpr{{.*}}'int' lvalue ParmVar{{.*}} 't' 'int'
+  // CHECK-NEXT: ForStmt
+  // CHECK: NullStmt
+
+}
+
+void Inst() {
+  static constexpr unsigned CEVar = 1;
+  TemplUses<CEVar>(5);
+}
+#endif
diff --git a/clang/test/SemaOpenACC/combined-construct-present-clause.c b/clang/test/SemaOpenACC/combined-construct-present-clause.c
new file mode 100644
index 00000000000000..acdaaa33929233
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-present-clause.c
@@ -0,0 +1,58 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  void *PointerMember;
+} Complete;
+void uses(int IntParam, short *PointerParam, float ArrayParam[5], Complete CompositeParam) {
+  int LocalInt;
+  short *LocalPointer;
+  float LocalArray[5];
+  Complete LocalComposite;
+  // Check Appertainment:
+#pragma acc parallel loop present(LocalInt)
+  for(int i = 5; i < 10;++i);
+#pragma acc serial loop present(LocalInt)
+  for(int i = 5; i < 10;++i);
+#pragma acc kernels loop present(LocalInt)
+  for(int i = 5; i < 10;++i);
+
+  // Valid cases:
+#pragma acc parallel loop present(LocalInt, LocalPointer, LocalArray)
+  for(int i = 5; i < 10;++i);
+#pragma acc parallel loop present(LocalArray[2:1])
+  for(int i = 5; i < 10;++i);
+
+#pragma acc parallel loop present(LocalComposite.ScalarMember, LocalComposite.ScalarMember)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(1 + IntParam)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(+IntParam)
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel loop present(PointerParam[2:])
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel loop present(ArrayParam[2:5])
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present((float*)ArrayParam[2:5])
+  for(int i = 5; i < 10;++i);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present((float)ArrayParam[2])
+  for(int i = 5; i < 10;++i);
+
+  // expected-error@+1{{OpenACC 'present' clause is not valid on 'loop' directive}}
+#pragma acc loop present(LocalInt)
+  for(int i = 5; i < 10;++i);
+}
diff --git a/clang/test/SemaOpenACC/combined-construct-present-clause.cpp b/clang/test/SemaOpenACC/combined-construct-present-clause.cpp
new file mode 100644
index 00000000000000..814acf25c01c02
--- /dev/null
+++ b/clang/test/SemaOpenACC/combined-construct-present-clause.cpp
@@ -0,0 +1,112 @@
+// RUN: %clang_cc1 %s -fopenacc -verify
+
+enum SomeE{};
+typedef struct IsComplete {
+  struct S { int A; } CompositeMember;
+  int ScalarMember;
+  float ArrayMember[5];
+  SomeE EnumMember;
+  char *PointerMember;
+} Complete;
+
+void uses(int IntParam, char *PointerParam, float ArrayParam[5], Complete CompositeParam, int &IntParamRef) {
+  int LocalInt;
+  char *LocalPointer;
+  float LocalArray[5];
+  // Check Appertainment:
+#pragma acc parallel loop present(LocalInt)
+  for(unsigned I = 0; I < 5; ++I);
+#pragma acc serial loop present(LocalInt)
+  for(unsigned I = 0; I < 5; ++I);
+#pragma acc kernels loop present(LocalInt)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // Valid cases:
+#pragma acc parallel loop present(LocalInt, LocalPointer, LocalArray)
+  for(unsigned I = 0; I < 5; ++I);
+#pragma acc parallel loop present(LocalArray[2:1])
+  for(unsigned I = 0; I < 5; ++I);
+
+  Complete LocalComposite2;
+#pragma acc parallel loop present(LocalComposite2.ScalarMember, LocalComposite2.ScalarMember)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(1 + IntParam)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(+IntParam)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel loop present(PointerParam[2:])
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+1{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+#pragma acc parallel loop present(ArrayParam[2:5])
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+2{{OpenACC sub-array specified range [2:5] would be out of the range of the subscripted array size of 5}}
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present((float*)ArrayParam[2:5])
+  for(unsigned I = 0; I < 5; ++I);
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present((float)ArrayParam[2])
+  for(unsigned I = 0; I < 5; ++I);
+}
+
+template<typename T, unsigned Int, typename V>
+void TemplUses(T t, T (&arrayT)[Int], V TemplComp) {
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(+t)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+  // expected-note@#TEMPL_USES_INST{{in instantiation of}}
+#pragma acc parallel loop present(Int)
+  for(unsigned I = 0; I < 5; ++I);
+
+  // expected-error@+1{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+#pragma acc parallel loop present(t, Int)
+  for(unsigned I = 0; I < 5; ++I);
+
+#pragma acc parallel loop present(arrayT)
+  for(unsigned I = 0; I < 5; ++I);
+
+#pragma acc parallel loop present(TemplComp)
+  for(unsigned I = 0; I < 5; ++I);
+
+#pragma acc parallel loop present(TemplComp.PointerMember[5])
+  for(unsigned I = 0; I < 5; ++I);
+ int *Pointer;
+#pragma acc parallel loop present(Pointer[:Int])
+  for(unsigned I = 0; I < 5; ++I);
+#pragma acc parallel loop present(Pointer[:t])
+  for(unsigned I = 0; I < 5; ++I);
+  // expected-error@+1{{OpenACC sub-array length is unspecified and cannot be inferred because the subscripted value is not an array}}
+#pragma acc parallel loop present(Pointer[1:])
+  for(unsigned I = 0; I < 5; ++I);
+}
+
+template<unsigned Int, auto &NTTP_REF>
+void NTTP() {
+  // NTTP's are only valid if it is a reference to something.
+  // expected-error@+2{{OpenACC variable is not a valid variable name, sub-array, array element, member of a composite variable, or composite variable member}}
+  // expected-note@#NTTP_INST{{in instantiation of}}
+#pragma acc parallel loop present(Int)
+  for(unsigned I = 0; I < 5; ++I);
+
+#pragma acc parallel loop present(NTTP_REF)
+  for(unsigned I = 0; I < 5; ++I);
+}
+
+void Inst() {
+  static constexpr int NTTP_REFed = 1;
+  int i;
+  int Arr[5];
+  Complete C;
+  TemplUses(i, Arr, C); // #TEMPL_USES_INST
+  NTTP<5, NTTP_REFed>(); // #NTTP_INST
+}

From 5683fc5cc043d1f43294bc57e04a901ad8cafcf5 Mon Sep 17 00:00:00 2001
From: Nico Weber <thakis@chromium.org>
Date: Tue, 26 Nov 2024 14:05:38 -0500
Subject: [PATCH 37/41] [gn] port a5dd6463608b

---
 llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
index cf4a31a144557e..67343297fae41d 100644
--- a/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/lib/builtins/BUILD.gn
@@ -285,6 +285,7 @@ static_library("builtins") {
     if (long_double_is_80_bits) {
       sources += [
         "divxc3.c",
+        "extendhfxf2.c",
         "extendxftf2.c",
         "fixunsxfdi.c",
         "fixunsxfsi.c",

From 752ef93392b2d049064dc77087bf414d69283cfc Mon Sep 17 00:00:00 2001
From: Thurston Dang <thurston@google.com>
Date: Tue, 26 Nov 2024 11:09:58 -0800
Subject: [PATCH 38/41] [NFC][clang] Fix header comment in
 llvm/test/MC/AArch64/local-bounds-single-trap.ll (#117642)

llvm/test/MC/AArch64/local-bounds-single-trap.ll was introduced in
https://github.com/llvm/llvm-project/pull/65972 to demonstrate that
nomerge did not work properly, which is documented in the header
comment.

https://github.com/llvm/llvm-project/pull/101549 fixed nomerge for trap
builtins and
https://github.com/llvm/llvm-project/commit/ae6dc64ec670891cb15049277e43133d4df7fb4b
updated the test assertions, but not the header comment. This patch
updates the header comment accordingly.
---
 llvm/test/MC/AArch64/local-bounds-single-trap.ll | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/llvm/test/MC/AArch64/local-bounds-single-trap.ll b/llvm/test/MC/AArch64/local-bounds-single-trap.ll
index 6a017e24ab3cdf..8b8a3e430df692 100644
--- a/llvm/test/MC/AArch64/local-bounds-single-trap.ll
+++ b/llvm/test/MC/AArch64/local-bounds-single-trap.ll
@@ -1,7 +1,8 @@
 ; RUN: llc -O3 -mtriple arm64-linux -filetype asm -o - %s | FileCheck %s -check-prefix CHECK-ASM
-; What this test does is check that even with nomerge, the functions still get merged in
-; compiled code as the ubsantrap call gets lowered to a single instruction: brk.
-
+; This test checks that nomerge correctly prevents the traps from being merged
+; in the compiled code.
+; Prior to ae6dc64ec670891cb15049277e43133d4df7fb4b, this test showed that
+; nomerge did not work correctly.
 
 @B = dso_local global [10 x i8] zeroinitializer, align 1
 @B2 = dso_local global [10 x i8] zeroinitializer, align 1

From 5bdcaf1a0804799106bc864215ba03af4b762966 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 26 Nov 2024 14:18:49 -0500
Subject: [PATCH 39/41] [github] Document the process for requesting the CI/CD
 role (#115321)

See https://discourse.llvm.org/t/rfc-proposing-a-new-ci-cd-admin-for-the-project
---
 llvm/docs/GitHub.rst | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/llvm/docs/GitHub.rst b/llvm/docs/GitHub.rst
index d785d9da9a7f48..37995969b6df38 100644
--- a/llvm/docs/GitHub.rst
+++ b/llvm/docs/GitHub.rst
@@ -433,3 +433,12 @@ will be created with the specified commits.
 If a commit you want to backport does not apply cleanly, you may resolve
 the conflicts locally and then create a pull request against the release
 branch.  Just make sure to add the release milestone to the pull request.
+
+Getting admin access to CI infrastructure
+=========================================
+
+Any individual who is responsible for setting up and/or maintaining CI infrastructure for a LLVM project can
+request to be granted the CI/CD role to the LLVM organization admins. The request can be made by creating
+`a Github issue <https://github.com/llvm/llvm-project/issues/new>`_ and using the ``infrastructure`` label.
+Applicants must include a justification for why the role is being requested. Applications are reviewed on a
+case-by-case basis by the LLVM admins and the role can be revoked at any point as the LLVM admins see fit.

From 44ef12b020c07d195519d0613d21f3b8ab29a22d Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Tue, 26 Nov 2024 14:33:56 -0500
Subject: [PATCH 40/41] [libc++] Refactor tests for hijacked address operator
 in vector (#117457)

This reduces the amount of boilerplate needed to implement the tests.
---
 .../vector/addressof.compile.pass.cpp         | 47 +++++++++++++++++++
 .../assign_copy.addressof.compile.pass.cpp    | 24 ----------
 .../assign_move.addressof.compile.pass.cpp    | 25 ----------
 .../move.addressof.compile.pass.cpp           | 32 -------------
 .../emplace.addressof.compile.pass.cpp        | 25 ----------
 .../erase_iter.addressof.compile.pass.cpp     | 23 ---------
 ...erase_iter_iter.addressof.compile.pass.cpp | 23 ---------
 ..._iter_iter_iter.addressof.compile.pass.cpp | 31 ------------
 ...ert_iter_rvalue.addressof.compile.pass.cpp | 25 ----------
 ...iter_size_value.addressof.compile.pass.cpp | 24 ----------
 ...sert_iter_value.addressof.compile.pass.cpp | 24 ----------
 .../swap.addressof.compile.pass.cpp           | 25 ----------
 12 files changed, 47 insertions(+), 281 deletions(-)
 create mode 100644 libcxx/test/std/containers/sequences/vector/addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.cons/assign_copy.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.cons/assign_move.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.cons/move.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/emplace.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_iter_iter.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_rvalue.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_size_value.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_value.addressof.compile.pass.cpp
 delete mode 100644 libcxx/test/std/containers/sequences/vector/vector.special/swap.addressof.compile.pass.cpp

diff --git a/libcxx/test/std/containers/sequences/vector/addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/addressof.compile.pass.cpp
new file mode 100644
index 00000000000000..120b7b289af93e
--- /dev/null
+++ b/libcxx/test/std/containers/sequences/vector/addressof.compile.pass.cpp
@@ -0,0 +1,47 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03 && !stdlib=libc++
+
+// <vector>
+
+// Validate various member functions of std::vector with an ADL-hijacking operator&
+
+#include <vector>
+#include <utility>
+
+#include "operator_hijacker.h"
+#include "test_iterators.h"
+
+using Vector = std::vector<operator_hijacker>;
+
+void test(
+    Vector v, Vector::const_iterator it, cpp17_input_iterator<operator_hijacker*> other_it, operator_hijacker val) {
+  // emplace / insert
+  v.emplace(it);
+  v.insert(it, it, it);
+  v.insert(it, other_it, other_it);
+  v.insert(it, operator_hijacker());
+  v.insert(it, 1, val);
+  v.insert(it, val);
+
+  // erase
+  v.erase(it);
+  v.erase(it, it);
+
+  // assignment
+  v = static_cast<Vector&>(v);
+  v = std::move(v);
+
+  // construction
+  { Vector v2(std::move(v)); }
+  { Vector v2(std::move(v), std::allocator<operator_hijacker>()); }
+
+  // swap
+  v.swap(v);
+}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/assign_copy.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/assign_copy.addressof.compile.pass.cpp
deleted file mode 100644
index ceecdfda3fa304..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/assign_copy.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// vector& operator=(const vector& c);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> vo;
-  std::vector<operator_hijacker> v;
-  v = vo;
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/assign_move.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/assign_move.addressof.compile.pass.cpp
deleted file mode 100644
index 2008c8d048f4b4..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/assign_move.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// vector& operator=(vector&& c);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-#include <utility>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> vo;
-  std::vector<operator_hijacker> v;
-  v = std::move(vo);
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.cons/move.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.cons/move.addressof.compile.pass.cpp
deleted file mode 100644
index 521b8705d49202..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.cons/move.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,32 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03 && !stdlib=libc++
-
-// <vector>
-
-// vector(vector&& c);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-#include <utility>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  {
-    std::vector<operator_hijacker> vo;
-    std::vector<operator_hijacker> v(std::move(vo));
-  }
-  {
-    std::vector<operator_hijacker> vo;
-    std::vector<operator_hijacker> v(std::move(vo), std::allocator<operator_hijacker>());
-  }
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/emplace.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/emplace.addressof.compile.pass.cpp
deleted file mode 100644
index 43e553e71e7414..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/emplace.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03 && !stdlib=libc++
-
-// <vector>
-
-// template <class... Args> iterator emplace(const_iterator pos, Args&&... args);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  v.emplace(v.end());
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.addressof.compile.pass.cpp
deleted file mode 100644
index 0fce3498fec7e8..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// iterator erase(const_iterator position);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  v.erase(v.end());
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.addressof.compile.pass.cpp
deleted file mode 100644
index bc90fa783e98f5..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/erase_iter_iter.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,23 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// iterator erase(const_iterator position);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  v.erase(v.begin(), v.end());
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_iter_iter.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_iter_iter.addressof.compile.pass.cpp
deleted file mode 100644
index f8311090b37e3c..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_iter_iter.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// template <class Iter>
-//   iterator insert(const_iterator position, Iter first, Iter last);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-#include "test_iterators.h"
-
-void test(cpp17_input_iterator<operator_hijacker*> i) {
-  {
-    std::vector<operator_hijacker> v;
-    v.insert(v.end(), i, i);
-  }
-  {
-    std::vector<operator_hijacker> v;
-    v.insert(v.end(), v.begin(), v.end());
-  }
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_rvalue.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_rvalue.addressof.compile.pass.cpp
deleted file mode 100644
index 11f24604eeac4d..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_rvalue.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// UNSUPPORTED: c++03 && !stdlib=libc++
-
-// <vector>
-
-// iterator insert(const_iterator position, value_type&& x);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  v.insert(v.end(), operator_hijacker());
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_size_value.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_size_value.addressof.compile.pass.cpp
deleted file mode 100644
index c02b92a4998e85..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_size_value.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// iterator insert(const_iterator position, size_type n, const value_type& x);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  operator_hijacker val;
-  v.insert(v.end(), 1, val);
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_value.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_value.addressof.compile.pass.cpp
deleted file mode 100644
index fbf1a4f50b974f..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.modifiers/insert_iter_value.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// iterator insert(const_iterator position, const value_type& x);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> v;
-  operator_hijacker val;
-  v.insert(v.end(), val);
-}
diff --git a/libcxx/test/std/containers/sequences/vector/vector.special/swap.addressof.compile.pass.cpp b/libcxx/test/std/containers/sequences/vector/vector.special/swap.addressof.compile.pass.cpp
deleted file mode 100644
index 4e908d9ff6eaca..00000000000000
--- a/libcxx/test/std/containers/sequences/vector/vector.special/swap.addressof.compile.pass.cpp
+++ /dev/null
@@ -1,25 +0,0 @@
-//===----------------------------------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// <vector>
-
-// template <class T, class Alloc>
-//   void swap(vector<T,Alloc>& x, vector<T,Alloc>& y);
-
-// Validate whether the container can be copy-assigned with an ADL-hijacking operator&
-
-#include <vector>
-
-#include "test_macros.h"
-#include "operator_hijacker.h"
-
-void test() {
-  std::vector<operator_hijacker> vo;
-  std::vector<operator_hijacker> v;
-  v.swap(vo);
-}

From e57b327be27bd185595a3383dfac90ec6651c123 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Tue, 26 Nov 2024 14:44:09 -0500
Subject: [PATCH 41/41] AMDGPU: Legalize fminimum and fmaximum f32 for gfx950
 (#117634)

Select to minimum3/maximum3. Leave f16/v2f16 for later
since it's complicated by only having the vector version.
---
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp    |    4 +
 llvm/lib/Target/AMDGPU/VOP3Instructions.td   |   17 +
 llvm/test/CodeGen/AMDGPU/fmaximum3.ll        | 1368 +++++++++++-------
 llvm/test/CodeGen/AMDGPU/fminimum3.ll        | 1368 +++++++++++-------
 llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll |  421 +++---
 llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll |  421 +++---
 6 files changed, 1977 insertions(+), 1622 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 3f0845864336fe..2e0f95161935a9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -855,6 +855,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction({ISD::FMINIMUM, ISD::FMAXIMUM},
                        {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
                        Custom);
+  } else {
+    // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
+    if (Subtarget->hasMinimum3Maximum3F32())
+      setOperationAction({ISD::FMAXIMUM, ISD::FMINIMUM}, MVT::f32, Legal);
   }
 
   setOperationAction(ISD::INTRINSIC_WO_CHAIN,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 5d4d56e8b0ad22..2b207e008581b3 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1234,6 +1234,23 @@ def : IntClampPat<V_MQSAD_PK_U16_U8_e64, int_amdgcn_mqsad_pk_u16_u8>;
 def : IntClampPat<V_QSAD_PK_U16_U8_e64, int_amdgcn_qsad_pk_u16_u8>;
 def : IntClampPat<V_MQSAD_U32_U8_e64, int_amdgcn_mqsad_u32_u8>;
 
+//===----------------------------------------------------------------------===//
+// Floating-point operation Patterns
+//===----------------------------------------------------------------------===//
+
+// Implement fminimum(x, y) by using minimum3(x, y, y)
+class MinimumMaximumByMinimum3Maximum3<SDPatternOperator node, ValueType vt,
+                                       Instruction inst> : GCNPat<
+  (vt (node (VOP3Mods vt:$src0, i32:$src0_mods), (VOP3Mods vt:$src1, i32:$src1_mods))),
+  (inst $src0_mods, $src0, $src1_mods, $src1, $src1_mods, $src1)
+>;
+
+// Prefer the real 2 operand form if legal
+let SubtargetPredicate = HasMinimum3Maximum3F32, AddedComplexity = -1000 in {
+def : MinimumMaximumByMinimum3Maximum3<fminimum, f32, V_MINIMUM3_F32_e64>;
+def : MinimumMaximumByMinimum3Maximum3<fmaximum, f32, V_MAXIMUM3_F32_e64>;
+}
+
 //===----------------------------------------------------------------------===//
 // Target-specific instruction encodings.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
index 08122cd0d89eab..209ae86b4dedce 100644
--- a/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmaximum3.ll
@@ -14,19 +14,26 @@ define float @v_fmaximum3_f32(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -43,19 +50,26 @@ define float @v_fmaximum3_f32_commute(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v2, v0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %c, float %max0)
   ret float %max1
@@ -70,21 +84,30 @@ define amdgpu_ps i32 @s_fmaximum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_fmaximum3_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX940-LABEL: s_fmaximum3_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, s2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX940-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_fmaximum3_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s1, s1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s2, s2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   %cast = bitcast float %max1 to i32
@@ -103,19 +126,26 @@ define float @v_fmaximum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fabs0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, |v0|, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, |v0|, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %max0 = call float @llvm.maximum.f32(float %a.fabs, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -133,19 +163,26 @@ define float @v_fmaximum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fabs1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, v0, |v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, |v1|, |v1|
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %max0 = call float @llvm.maximum.f32(float %a, float %b.fabs)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -163,19 +200,26 @@ define float @v_fmaximum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fabs2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c.fabs)
@@ -193,19 +237,26 @@ define float @v_fmaximum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, |v0|, |v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v1, v0, |v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, |v0|, |v1|, |v1|
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -225,19 +276,26 @@ define float @v_fmaximum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, -v0, -v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v1, v0, -v2
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, -v0, -v1, -v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -v2, -v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
   %c.fneg = fneg float %c
@@ -257,19 +315,26 @@ define float @v_fmaximum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fneg_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v1, v0, -|v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, -|v0|, -|v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v1, v0, -|v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, -|v0|, -|v1|, -|v1|
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -|v2|, -|v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -292,19 +357,26 @@ define float @v_fmaximum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fneg0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, -v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, -v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, -v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %max0 = call float @llvm.maximum.f32(float %a.fneg, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -322,19 +394,26 @@ define float @v_fmaximum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fneg1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v3, v0, -v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -v1, -v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
   %max0 = call float @llvm.maximum.f32(float %a, float %b.fneg)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
@@ -352,19 +431,26 @@ define float @v_fmaximum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_fneg2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_fneg2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v1, v0, -v2
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_fneg2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -v2, -v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c.fneg)
@@ -382,19 +468,27 @@ define float @v_fmaximum3_f32_const0(float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, 0x41000000, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_const0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, 0x41000000, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_const0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, 0x41000000, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_const0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 8.0, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -411,19 +505,27 @@ define float @v_fmaximum3_f32__const2(float %a, float %b) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 0x41000000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32__const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32__const2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32__const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float 8.0)
   ret float %max1
@@ -440,19 +542,26 @@ define float @v_fmaximum3_f32_inlineimm0(float %b, float %c) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_inlineimm0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, 4.0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_inlineimm0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float 4.0, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   ret float %max1
@@ -469,19 +578,26 @@ define float @v_fmaximum3_f32__inlineimm(float %a, float %b) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32__inlineimm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32__inlineimm:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float 4.0)
   ret float %max1
@@ -500,19 +616,28 @@ define float @v_fmaximum3_f32_const1_const2(float %a) {
 ; GFX12-NEXT:    v_maximum3_f32 v0, v0, s0, 0x41800000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_f32_const1_const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, 0x41800000, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v1, 0x41000000, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, 0x41800000, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_f32_const1_const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_mov_b32 s0, 0x41800000
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float 8.0)
   %max1 = call float @llvm.maximum.f32(float %max0, float 16.0)
   ret float %max1
@@ -530,27 +655,36 @@ define <2 x float> @v_fmaximum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX12-NEXT:    v_maximum3_f32 v1, v5, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v4, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v5, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v6, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v4, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v5, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v0, v4, v0, v0
+; GFX950-NEXT:    v_maximum3_f32 v1, v5, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %c, <2 x float> %max0)
   ret <2 x float> %max1
@@ -568,27 +702,36 @@ define <2 x float> @v_fmaximum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v0, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v6, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v0, v4
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v5
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -606,27 +749,36 @@ define <2 x float> @v_fmaximum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v1, |v1|, |v3|, |v5|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32__fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v6, |v1|, |v3|
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
-; GFX9-NEXT:    v_max_f32_e64 v3, |v0|, |v2|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v2, v0, |v4|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_max_f32_e64 v2, v1, |v5|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v6, |v1|, |v3|
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
+; GFX940-NEXT:    v_max_f32_e64 v3, |v0|, |v2|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v2, v0, |v4|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_max_f32_e64 v2, v1, |v5|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32__fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, |v1|, |v3|, |v3|
+; GFX950-NEXT:    v_maximum3_f32 v0, |v0|, |v2|, |v2|
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, |v4|, |v4|
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, |v5|, |v5|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
   %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
   %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
@@ -647,27 +799,36 @@ define <2 x float> @v_fmaximum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v1, -v1, -v3, -v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32__fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v6, -v1, -v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
-; GFX9-NEXT:    v_max_f32_e64 v3, -v0, -v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v2, v0, -v4
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_max_f32_e64 v2, v1, -v5
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v6, -v1, -v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
+; GFX940-NEXT:    v_max_f32_e64 v3, -v0, -v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v2, v0, -v4
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_max_f32_e64 v2, v1, -v5
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32__fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, -v1, -v3, -v3
+; GFX950-NEXT:    v_maximum3_f32 v0, -v0, -v2, -v2
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -v4, -v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, -v5, -v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
   %b.fneg = fneg <2 x float> %b
   %c.fneg = fneg <2 x float> %c
@@ -688,27 +849,36 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, 2.0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v4, 2.0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, 2.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v4, 2.0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_max_f32_e32 v4, 2.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_max_f32_e32 v2, v1, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32__inlineimm1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, 2.0, 2.0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 2.0, 2.0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -726,27 +896,36 @@ define <2 x float> @v_fmaximum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX12-NEXT:    v_maximum3_f32 v1, v1, v3, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v2f32__inlineimm2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_max_f32_e32 v2, 4.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX940-NEXT:    v_max_f32_e32 v2, 4.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v2f32__inlineimm2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.maximum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
   ret <2 x float> %max1
@@ -765,35 +944,46 @@ define <3 x float> @v_fmaximum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX12-NEXT:    v_maximum3_f32 v2, v8, v2, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v9, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v6, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v7, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v8, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v9, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v6, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v7, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v8, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v6, v0, v0
+; GFX950-NEXT:    v_maximum3_f32 v1, v7, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v2, v8, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %c, <3 x float> %max0)
   ret <3 x float> %max1
@@ -812,35 +1002,46 @@ define <3 x float> @v_fmaximum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX12-NEXT:    v_maximum3_f32 v2, v2, v5, v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v9, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v1, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v2, v8
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v9, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v6
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v1, v7
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v2, v8
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v6, v6
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v7, v7
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v8, v8
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -859,35 +1060,46 @@ define <3 x float> @v_fmaximum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v2, |v2|, |v5|, |v8|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32__fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v9, |v2|, |v5|
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
-; GFX9-NEXT:    v_max_f32_e64 v5, |v1|, |v4|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
-; GFX9-NEXT:    v_max_f32_e64 v4, |v0|, |v3|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v0, |v6|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v1, |v7|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v2, |v8|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v9, |v2|, |v5|
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
+; GFX940-NEXT:    v_max_f32_e64 v5, |v1|, |v4|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
+; GFX940-NEXT:    v_max_f32_e64 v4, |v0|, |v3|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v0, |v6|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v1, |v7|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v2, |v8|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32__fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, |v2|, |v5|, |v5|
+; GFX950-NEXT:    v_maximum3_f32 v1, |v1|, |v4|, |v4|
+; GFX950-NEXT:    v_maximum3_f32 v0, |v0|, |v3|, |v3|
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, |v6|, |v6|
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, |v7|, |v7|
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, |v8|, |v8|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
   %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
   %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
@@ -909,35 +1121,46 @@ define <3 x float> @v_fmaximum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_maximum3_f32 v2, -v2, -v5, -v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32__fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e64 v9, -v2, -v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
-; GFX9-NEXT:    v_max_f32_e64 v5, -v1, -v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
-; GFX9-NEXT:    v_max_f32_e64 v4, -v0, -v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v0, -v6
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v1, -v7
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_max_f32_e64 v3, v2, -v8
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e64 v9, -v2, -v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
+; GFX940-NEXT:    v_max_f32_e64 v5, -v1, -v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
+; GFX940-NEXT:    v_max_f32_e64 v4, -v0, -v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v0, -v6
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v1, -v7
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_max_f32_e64 v3, v2, -v8
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32__fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, -v2, -v5, -v5
+; GFX950-NEXT:    v_maximum3_f32 v1, -v1, -v4, -v4
+; GFX950-NEXT:    v_maximum3_f32 v0, -v0, -v3, -v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, -v6, -v6
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, -v7, -v7
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, -v8, -v8
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
   %b.fneg = fneg <3 x float> %b
   %c.fneg = fneg <3 x float> %c
@@ -959,35 +1182,46 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX12-NEXT:    v_maximum3_f32 v2, v2, 2.0, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
-; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_max_f32_e32 v6, 2.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v6, 2.0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX940-NEXT:    v_max_f32_e32 v6, 2.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_max_f32_e32 v6, 2.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_max_f32_e32 v6, v0, v3
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_max_f32_e32 v3, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, v2, v5
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32__inlineimm1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, 2.0, 2.0
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, 2.0, 2.0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 2.0, 2.0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -1006,35 +1240,46 @@ define <3 x float> @v_fmaximum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX12-NEXT:    v_maximum3_f32 v2, v2, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fmaximum3_v3f32__inlineimm2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v6, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_max_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v3, 4.0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v6, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_max_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_max_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, 4.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v3, 4.0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fmaximum3_v3f32__inlineimm2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, 4.0, 4.0
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.maximum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
   ret <3 x float> %max1
@@ -3165,19 +3410,26 @@ define <2 x float> @v_no_fmaximum3_f32__multi_use(float %a, float %b, float %c)
 ; GFX12-NEXT:    v_maximum_f32 v1, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_no_fmaximum3_f32__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_no_fmaximum3_f32__multi_use:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_max_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fmaximum3_f32__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_maximum3_f32 v1, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   %insert.0 = insertelement <2 x float> poison, float %max0, i32 0
@@ -3193,22 +3445,31 @@ define amdgpu_ps <2 x i32> @s_no_fmaximum3_f32__multi_use(float inreg %a, float
 ; GFX12-NEXT:    s_maximum_f32 s1, s0, s2
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_no_fmaximum3_f32__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_max_f32_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_max_f32_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX940-LABEL: s_no_fmaximum3_f32__multi_use:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_max_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_max_f32_e32 v1, s2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX940-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fmaximum3_f32__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s1, s1
+; GFX950-NEXT:    v_maximum3_f32 v1, v0, s2, s2
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.maximum.f32(float %a, float %b)
   %max1 = call float @llvm.maximum.f32(float %max0, float %c)
   %cast0 = bitcast float %max0 to i32
@@ -3372,6 +3633,3 @@ define <2 x double> @v_no_fmaximum3_f64__multi_use(double %a, double %b, double
   %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
   ret <2 x double> %insert.1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX940: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fminimum3.ll b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
index 43293512c8c21d..000f6c190b9773 100644
--- a/llvm/test/CodeGen/AMDGPU/fminimum3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fminimum3.ll
@@ -14,19 +14,26 @@ define float @v_fminimum3_f32(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -43,19 +50,26 @@ define float @v_fminimum3_f32_commute(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v2, v0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v2, v0, v0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %c, float %max0)
   ret float %max1
@@ -70,21 +84,30 @@ define amdgpu_ps i32 @s_fminimum3_f32(float inreg %a, float inreg %b, float inre
 ; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_fminimum3_f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX940-LABEL: s_fminimum3_f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, s2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX940-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_fminimum3_f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s1, s1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s2, s2
+; GFX950-NEXT:    s_nop 0
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   %cast = bitcast float %max1 to i32
@@ -103,19 +126,26 @@ define float @v_fminimum3_f32_fabs0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fabs0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fabs0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, |v0|, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fabs0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, |v0|, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %max0 = call float @llvm.minimum.f32(float %a.fabs, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -133,19 +163,26 @@ define float @v_fminimum3_f32_fabs1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, |v1|, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fabs1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, v0, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fabs1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, v0, |v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fabs1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, |v1|, |v1|
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %max0 = call float @llvm.minimum.f32(float %a, float %b.fabs)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -163,19 +200,26 @@ define float @v_fminimum3_f32_fabs2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fabs2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fabs2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v1, v0, |v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fabs2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fabs = call float @llvm.fabs.f32(float %c)
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c.fabs)
@@ -193,19 +237,26 @@ define float @v_fminimum3_f32_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, |v0|, |v1|, |v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, |v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v1, v0, |v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, |v0|, |v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v1, v0, |v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, |v0|, |v1|, |v1|
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, |v2|, |v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -225,19 +276,26 @@ define float @v_fminimum3_f32_fneg_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, -v0, -v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, -v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, -v0, -v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v1, v0, -v2
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, -v0, -v1, -v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -v2, -v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %b.fneg = fneg float %b
   %c.fneg = fneg float %c
@@ -257,19 +315,26 @@ define float @v_fminimum3_f32_fneg_fabs_all(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, -|v0|, -|v1|, -|v2|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fneg_fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, -|v0|, -|v1|
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v1, v0, -|v2|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, -|v0|, -|v1|
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -|v0|, -|v1|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v1, v0, -|v2|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -|v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fneg_fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, -|v0|, -|v1|, -|v1|
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -|v2|, -|v2|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %c.fabs = call float @llvm.fabs.f32(float %c)
@@ -292,19 +357,26 @@ define float @v_fminimum3_f32_fneg0(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, -v0, v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fneg0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, -v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fneg0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, -v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fneg0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, -v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg float %a
   %max0 = call float @llvm.minimum.f32(float %a.fneg, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -322,19 +394,26 @@ define float @v_fminimum3_f32_fneg1(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, -v1, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fneg1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v3, v0, -v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fneg1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v3, v0, -v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fneg1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -v1, -v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %b.fneg = fneg float %b
   %max0 = call float @llvm.minimum.f32(float %a, float %b.fneg)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
@@ -352,19 +431,26 @@ define float @v_fminimum3_f32_fneg2(float %a, float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, -v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_fneg2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v1, v0, -v2
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_fneg2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v1, v0, -v2
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_fneg2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -v2, -v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %c.fneg = fneg float %c
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c.fneg)
@@ -382,19 +468,27 @@ define float @v_fminimum3_f32_const0(float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, 0x41000000, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_const0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, 0x41000000, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_const0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, 0x41000000, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_const0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 8.0, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -411,19 +505,27 @@ define float @v_fminimum3_f32__const2(float %a, float %b) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 0x41000000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32__const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32__const2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32__const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float 8.0)
   ret float %max1
@@ -440,19 +542,26 @@ define float @v_fminimum3_f32_inlineimm0(float %b, float %c) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, 4.0, v1
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_inlineimm0:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, 4.0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_inlineimm0:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float 4.0, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   ret float %max1
@@ -469,19 +578,26 @@ define float @v_fminimum3_f32__inlineimm(float %a, float %b) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, v1, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32__inlineimm:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32__inlineimm:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32__inlineimm:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float 4.0)
   ret float %max1
@@ -500,19 +616,28 @@ define float @v_fminimum3_f32_const1_const2(float %a) {
 ; GFX12-NEXT:    v_minimum3_f32 v0, v0, s0, 0x41800000
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_f32_const1_const2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, 0x41800000, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_f32_const1_const2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v1, 0x41000000, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, 0x41800000, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_f32_const1_const2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    s_mov_b32 s0, 0x41000000
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_mov_b32 s0, 0x41800000
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s0, s0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float 8.0)
   %max1 = call float @llvm.minimum.f32(float %max0, float 16.0)
   ret float %max1
@@ -530,27 +655,36 @@ define <2 x float> @v_fminimum3_v2f32(<2 x float> %a, <2 x float> %b, <2 x float
 ; GFX12-NEXT:    v_minimum3_f32 v1, v5, v1, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v4, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v5, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v6, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v4, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v4, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v5, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v5, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v0, v4, v0, v0
+; GFX950-NEXT:    v_minimum3_f32 v1, v5, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %c, <2 x float> %max0)
   ret <2 x float> %max1
@@ -568,27 +702,36 @@ define <2 x float> @v_fminimum3_v2f32_commute(<2 x float> %a, <2 x float> %b, <2
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, v3, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v0, v4
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v6, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v0, v4
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, v1, v5
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -606,27 +749,36 @@ define <2 x float> @v_fminimum3_v2f32__fabs_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v1, |v1|, |v3|, |v5|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32__fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v6, |v1|, |v3|
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
-; GFX9-NEXT:    v_min_f32_e64 v3, |v0|, |v2|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v2, v0, |v4|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_min_f32_e64 v2, v1, |v5|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v6, |v1|, |v3|
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v3|
+; GFX940-NEXT:    v_min_f32_e64 v3, |v0|, |v2|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v2|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v2, v0, |v4|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v4|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_min_f32_e64 v2, v1, |v5|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v5|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32__fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, |v1|, |v3|, |v3|
+; GFX950-NEXT:    v_minimum3_f32 v0, |v0|, |v2|, |v2|
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, |v4|, |v4|
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, |v5|, |v5|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %a)
   %b.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %b)
   %c.fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %c)
@@ -647,27 +799,36 @@ define <2 x float> @v_fminimum3_v2f32__fneg_all(<2 x float> %a, <2 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v1, -v1, -v3, -v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32__fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v6, -v1, -v3
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
-; GFX9-NEXT:    v_min_f32_e64 v3, -v0, -v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v2, v0, -v4
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
-; GFX9-NEXT:    v_min_f32_e64 v2, v1, -v5
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v6, -v1, -v3
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v3
+; GFX940-NEXT:    v_min_f32_e64 v3, -v0, -v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v2, v0, -v4
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v2, vcc
+; GFX940-NEXT:    v_min_f32_e64 v2, v1, -v5
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32__fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, -v1, -v3, -v3
+; GFX950-NEXT:    v_minimum3_f32 v0, -v0, -v2, -v2
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -v4, -v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, -v5, -v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <2 x float> %a
   %b.fneg = fneg <2 x float> %b
   %c.fneg = fneg <2 x float> %c
@@ -688,27 +849,36 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm1(<2 x float> %a, <2 x float> %c
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, 2.0, v3
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32__inlineimm1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v4, 2.0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, 2.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v4, 2.0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_min_f32_e32 v4, 2.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    v_min_f32_e32 v2, v1, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32__inlineimm1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, 2.0, 2.0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 2.0, 2.0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> <float 2.0, float 2.0>)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> %c)
   ret <2 x float> %max1
@@ -726,27 +896,36 @@ define <2 x float> @v_fminimum3_v2f32__inlineimm2(<2 x float> %a, <2 x float> %b
 ; GFX12-NEXT:    v_minimum3_f32 v1, v1, v3, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v2f32__inlineimm2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v4, v1, v3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v2
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
-; GFX9-NEXT:    v_min_f32_e32 v2, 4.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v4, v1, v3
+; GFX940-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v2
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v4, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v5, v2, vcc
+; GFX940-NEXT:    v_min_f32_e32 v2, 4.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v2f32__inlineimm2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %a, <2 x float> %b)
   %max1 = call <2 x float> @llvm.minimum.v2f32(<2 x float> %max0, <2 x float> <float 4.0, float 4.0>)
   ret <2 x float> %max1
@@ -765,35 +944,46 @@ define <3 x float> @v_fminimum3_v3f32(<3 x float> %a, <3 x float> %b, <3 x float
 ; GFX12-NEXT:    v_minimum3_f32 v2, v8, v2, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v9, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v6, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v7, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v8, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v9, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v6, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v6, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v7, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v7, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v8, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v8, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v6, v0, v0
+; GFX950-NEXT:    v_minimum3_f32 v1, v7, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v2, v8, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %c, <3 x float> %max0)
   ret <3 x float> %max1
@@ -812,35 +1002,46 @@ define <3 x float> @v_fminimum3_v3f32_commute(<3 x float> %a, <3 x float> %b, <3
 ; GFX12-NEXT:    v_minimum3_f32 v2, v2, v5, v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32_commute:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v9, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v6
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v1, v7
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v2, v8
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32_commute:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v9, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v6
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v1, v7
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v2, v8
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32_commute:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v6, v6
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v7, v7
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v8, v8
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -859,35 +1060,46 @@ define <3 x float> @v_fminimum3_v3f32__fabs_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v2, |v2|, |v5|, |v8|
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32__fabs_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v9, |v2|, |v5|
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
-; GFX9-NEXT:    v_min_f32_e64 v5, |v1|, |v4|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
-; GFX9-NEXT:    v_min_f32_e64 v4, |v0|, |v3|
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v0, |v6|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v1, |v7|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v2, |v8|
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v9, |v2|, |v5|
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v2|, |v5|
+; GFX940-NEXT:    v_min_f32_e64 v5, |v1|, |v4|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v1|, |v4|
+; GFX940-NEXT:    v_min_f32_e64 v4, |v0|, |v3|
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, |v0|, |v3|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v0, |v6|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, |v6|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v1, |v7|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, |v7|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v2, |v8|
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v2, |v8|
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32__fabs_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, |v2|, |v5|, |v5|
+; GFX950-NEXT:    v_minimum3_f32 v1, |v1|, |v4|, |v4|
+; GFX950-NEXT:    v_minimum3_f32 v0, |v0|, |v3|, |v3|
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, |v6|, |v6|
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, |v7|, |v7|
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, |v8|, |v8|
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %a)
   %b.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %b)
   %c.fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %c)
@@ -909,35 +1121,46 @@ define <3 x float> @v_fminimum3_v3f32__fneg_all(<3 x float> %a, <3 x float> %b,
 ; GFX12-NEXT:    v_minimum3_f32 v2, -v2, -v5, -v8
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32__fneg_all:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e64 v9, -v2, -v5
-; GFX9-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
-; GFX9-NEXT:    v_min_f32_e64 v5, -v1, -v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
-; GFX9-NEXT:    v_min_f32_e64 v4, -v0, -v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v0, -v6
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v1, -v7
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
-; GFX9-NEXT:    v_min_f32_e64 v3, v2, -v8
-; GFX9-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e64 v9, -v2, -v5
+; GFX940-NEXT:    v_mov_b32_e32 v10, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v2, -v5
+; GFX940-NEXT:    v_min_f32_e64 v5, -v1, -v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v9, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v1, -v4
+; GFX940-NEXT:    v_min_f32_e64 v4, -v0, -v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, -v0, -v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v4, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v0, -v6
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v0, -v6
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v1, -v7
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v1, -v7
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v10, v3, vcc
+; GFX940-NEXT:    v_min_f32_e64 v3, v2, -v8
+; GFX940-NEXT:    v_cmp_o_f32_e64 vcc, v2, -v8
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v10, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32__fneg_all:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, -v2, -v5, -v5
+; GFX950-NEXT:    v_minimum3_f32 v1, -v1, -v4, -v4
+; GFX950-NEXT:    v_minimum3_f32 v0, -v0, -v3, -v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, -v6, -v6
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, -v7, -v7
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, -v8, -v8
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %a.fneg = fneg <3 x float> %a
   %b.fneg = fneg <3 x float> %b
   %c.fneg = fneg <3 x float> %c
@@ -959,35 +1182,46 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm1(<3 x float> %a, <3 x float> %c
 ; GFX12-NEXT:    v_minimum3_f32 v2, v2, 2.0, v5
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32__inlineimm1:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v2
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
-; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
-; GFX9-NEXT:    v_min_f32_e32 v6, 2.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v6, 2.0, v2
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX940-NEXT:    v_min_f32_e32 v6, 2.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v6, vcc
+; GFX940-NEXT:    v_min_f32_e32 v6, 2.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_min_f32_e32 v6, v0, v3
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    v_min_f32_e32 v3, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, v2, v5
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32__inlineimm1:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, 2.0, 2.0
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, 2.0, 2.0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 2.0, 2.0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> <float 2.0, float 2.0, float 2.0>)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> %c)
   ret <3 x float> %max1
@@ -1006,35 +1240,46 @@ define <3 x float> @v_fminimum3_v3f32__inlineimm2(<3 x float> %a, <3 x float> %b
 ; GFX12-NEXT:    v_minimum3_f32 v2, v2, v5, 4.0
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_fminimum3_v3f32__inlineimm2:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v6, v2, v5
-; GFX9-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX9-NEXT:    v_min_f32_e32 v5, v1, v4
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v4, v0, v3
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v1
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v3, 4.0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v6, v2, v5
+; GFX940-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
+; GFX940-NEXT:    v_min_f32_e32 v5, v1, v4
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v6, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
+; GFX940-NEXT:    v_min_f32_e32 v4, v0, v3
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v4, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, 4.0, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, 4.0, v1
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v1, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v3, 4.0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v2, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_fminimum3_v3f32__inlineimm2:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, 4.0, 4.0
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, 4.0, 4.0
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, 4.0, 4.0
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %a, <3 x float> %b)
   %max1 = call <3 x float> @llvm.minimum.v3f32(<3 x float> %max0, <3 x float> <float 4.0, float 4.0, float 4.0>)
   ret <3 x float> %max1
@@ -3165,19 +3410,26 @@ define <2 x float> @v_no_fminimum3_f32__multi_use(float %a, float %b, float %c)
 ; GFX12-NEXT:    v_minimum_f32 v1, v0, v2
 ; GFX12-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_no_fminimum3_f32__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v3, v0, v1
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, v0, v2
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX940-LABEL: v_no_fminimum3_f32__multi_use:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX940-NEXT:    v_min_f32_e32 v3, v0, v1
+; GFX940-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v4, v3, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, v0, v2
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX940-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_no_fminimum3_f32__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    v_minimum3_f32 v1, v0, v2, v2
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   %insert.0 = insertelement <2 x float> poison, float %max0, i32 0
@@ -3193,22 +3445,31 @@ define amdgpu_ps <2 x i32> @s_no_fminimum3_f32__multi_use(float inreg %a, float
 ; GFX12-NEXT:    s_minimum_f32 s1, s0, s2
 ; GFX12-NEXT:    ; return to shader part epilog
 ;
-; GFX9-LABEL: s_no_fminimum3_f32__multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    v_mov_b32_e32 v0, s1
-; GFX9-NEXT:    v_min_f32_e32 v1, s0, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX9-NEXT:    s_nop 1
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_min_f32_e32 v1, s2, v0
-; GFX9-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX9-NEXT:    s_nop 0
-; GFX9-NEXT:    v_readfirstlane_b32 s1, v1
-; GFX9-NEXT:    ; return to shader part epilog
+; GFX940-LABEL: s_no_fminimum3_f32__multi_use:
+; GFX940:       ; %bb.0:
+; GFX940-NEXT:    v_mov_b32_e32 v0, s1
+; GFX940-NEXT:    v_min_f32_e32 v1, s0, v0
+; GFX940-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
+; GFX940-NEXT:    s_nop 1
+; GFX940-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX940-NEXT:    v_min_f32_e32 v1, s2, v0
+; GFX940-NEXT:    v_cmp_o_f32_e32 vcc, s2, v0
+; GFX940-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX940-NEXT:    s_nop 0
+; GFX940-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX940-NEXT:    ; return to shader part epilog
+;
+; GFX950-LABEL: s_no_fminimum3_f32__multi_use:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s1, s1
+; GFX950-NEXT:    v_minimum3_f32 v1, v0, s2, s2
+; GFX950-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX950-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX950-NEXT:    ; return to shader part epilog
   %max0 = call float @llvm.minimum.f32(float %a, float %b)
   %max1 = call float @llvm.minimum.f32(float %max0, float %c)
   %cast0 = bitcast float %max0 to i32
@@ -3372,6 +3633,3 @@ define <2 x double> @v_no_fminimum3_f64__multi_use(double %a, double %b, double
   %insert.1 = insertelement <2 x double> %insert.0, double %max1, i32 1
   ret <2 x double> %insert.1
 }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX940: {{.*}}
-; GFX950: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
index c1fdfa2c4cf9ab..df7355c2c57bfa 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f32.ll
@@ -39,11 +39,7 @@ define float @v_maximum_f32(float %src0, float %src1) {
 ; GFX950-LABEL: v_maximum_f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32:
@@ -89,11 +85,17 @@ define float @v_maximum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -151,11 +153,7 @@ define float @v_maximum_f32__nsz(float %src0, float %src1) {
 ; GFX950-LABEL: v_maximum_f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nsz:
@@ -201,11 +199,17 @@ define float @v_maximum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_max_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -267,11 +271,7 @@ define float @v_maximum_f32__nnan_src0(float %arg0, float %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_src0:
@@ -344,11 +344,7 @@ define float @v_maximum_f32__nnan_src1(float %src0, float %arg1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX950-NEXT:    v_max_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_f32__nnan_src1:
@@ -429,12 +425,8 @@ define void @s_maximum_f32(float inreg %src0, float inreg %src1) {
 ; GFX950-LABEL: s_maximum_f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s1
-; GFX950-NEXT:    v_max_f32_e32 v1, s0, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s1, s1
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v0
 ; GFX950-NEXT:    ;;#ASMEND
@@ -521,15 +513,8 @@ define <2 x float> @v_maximum_v2f32(<2 x float> %src0, <2 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v2f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32:
@@ -583,12 +568,19 @@ define <2 x float> @v_maximum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -657,15 +649,8 @@ define <2 x float> @v_maximum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v2f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v4, v0, v2
-; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    v_max_f32_e32 v2, v1, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nsz:
@@ -719,12 +704,19 @@ define <2 x float> @v_maximum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v2f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v2f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -808,16 +800,10 @@ define void @s_maximum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
 ; GFX950-LABEL: s_maximum_v2f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s3
-; GFX950-NEXT:    v_max_f32_e32 v1, s1, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
-; GFX950-NEXT:    v_mov_b32_e32 v0, s2
-; GFX950-NEXT:    v_max_f32_e32 v3, s0, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_maximum3_f32 v1, v0, s3, s3
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, s2, s2
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
@@ -920,19 +906,9 @@ define <3 x float> @v_maximum_v3f32(<3 x float> %src0, <3 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v3f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX950-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX950-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32:
@@ -995,13 +971,21 @@ define <3 x float> @v_maximum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1082,19 +1066,9 @@ define <3 x float> @v_maximum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v3f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v6, v0, v3
-; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX950-NEXT:    v_max_f32_e32 v3, v1, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX950-NEXT:    v_max_f32_e32 v3, v2, v5
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nsz:
@@ -1157,13 +1131,21 @@ define <3 x float> @v_maximum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v3f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v3f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1253,23 +1235,10 @@ define <4 x float> @v_maximum_v4f32(<4 x float> %src0, <4 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v4f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX950-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX950-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX950-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32:
@@ -1341,14 +1310,23 @@ define <4 x float> @v_maximum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1440,23 +1418,10 @@ define <4 x float> @v_maximum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v4f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v8, v0, v4
-; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX950-NEXT:    v_max_f32_e32 v4, v1, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX950-NEXT:    v_max_f32_e32 v4, v2, v6
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX950-NEXT:    v_max_f32_e32 v4, v3, v7
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nsz:
@@ -1528,14 +1493,23 @@ define <4 x float> @v_maximum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_max_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_maximum_v4f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_max_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_max_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_max_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_max_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_max_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_max_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_max_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_max_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_maximum_v4f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v7, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1663,39 +1637,14 @@ define <8 x float> @v_maximum_v8f32(<8 x float> %src0, <8 x float> %src1) {
 ; GFX950-LABEL: v_maximum_v8f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v16, v0, v8
-; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX950-NEXT:    v_max_f32_e32 v8, v1, v9
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v2, v10
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v3, v11
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v4, v12
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v5, v13
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v6, v14
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX950-NEXT:    v_max_f32_e32 v8, v7, v15
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v8, v8
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v9, v9
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v10, v10
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v11, v11
+; GFX950-NEXT:    v_maximum3_f32 v4, v4, v12, v12
+; GFX950-NEXT:    v_maximum3_f32 v5, v5, v13, v13
+; GFX950-NEXT:    v_maximum3_f32 v6, v6, v14, v14
+; GFX950-NEXT:    v_maximum3_f32 v7, v7, v15, v15
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v8f32:
@@ -1980,64 +1929,23 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
-; GFX950-NEXT:    v_max_f32_e32 v33, v0, v16
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
-; GFX950-NEXT:    v_max_f32_e32 v34, v1, v17
-; GFX950-NEXT:    v_max_f32_e32 v35, v2, v18
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX950-NEXT:    v_max_f32_e32 v36, v3, v19
-; GFX950-NEXT:    v_max_f32_e32 v37, v4, v20
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
-; GFX950-NEXT:    v_max_f32_e32 v38, v5, v21
-; GFX950-NEXT:    v_max_f32_e32 v39, v6, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
-; GFX950-NEXT:    v_max_f32_e32 v48, v7, v23
-; GFX950-NEXT:    v_max_f32_e32 v49, v8, v24
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
-; GFX950-NEXT:    v_max_f32_e32 v50, v9, v25
-; GFX950-NEXT:    v_max_f32_e32 v51, v10, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
-; GFX950-NEXT:    v_max_f32_e32 v52, v11, v27
-; GFX950-NEXT:    v_max_f32_e32 v53, v12, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
-; GFX950-NEXT:    v_max_f32_e32 v54, v13, v29
-; GFX950-NEXT:    v_max_f32_e32 v55, v14, v30
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX950-NEXT:    v_maximum3_f32 v0, v0, v16, v16
+; GFX950-NEXT:    v_maximum3_f32 v1, v1, v17, v17
+; GFX950-NEXT:    v_maximum3_f32 v2, v2, v18, v18
+; GFX950-NEXT:    v_maximum3_f32 v3, v3, v19, v19
+; GFX950-NEXT:    v_maximum3_f32 v4, v4, v20, v20
+; GFX950-NEXT:    v_maximum3_f32 v5, v5, v21, v21
+; GFX950-NEXT:    v_maximum3_f32 v6, v6, v22, v22
+; GFX950-NEXT:    v_maximum3_f32 v7, v7, v23, v23
+; GFX950-NEXT:    v_maximum3_f32 v8, v8, v24, v24
+; GFX950-NEXT:    v_maximum3_f32 v9, v9, v25, v25
+; GFX950-NEXT:    v_maximum3_f32 v10, v10, v26, v26
+; GFX950-NEXT:    v_maximum3_f32 v11, v11, v27, v27
+; GFX950-NEXT:    v_maximum3_f32 v12, v12, v28, v28
+; GFX950-NEXT:    v_maximum3_f32 v13, v13, v29, v29
+; GFX950-NEXT:    v_maximum3_f32 v14, v14, v30, v30
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_max_f32_e32 v16, v15, v31
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT:    v_maximum3_f32 v15, v15, v31, v31
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_maximum_v16f32:
@@ -2176,3 +2084,4 @@ define <16 x float> @v_maximum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GFX9: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
index 2614fb3bf9f737..956de6de3aad3b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minimum.f32.ll
@@ -39,11 +39,7 @@ define float @v_minimum_f32(float %src0, float %src1) {
 ; GFX950-LABEL: v_minimum_f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32:
@@ -89,11 +85,17 @@ define float @v_minimum_f32__nnan(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -151,11 +153,7 @@ define float @v_minimum_f32__nsz(float %src0, float %src1) {
 ; GFX950-LABEL: v_minimum_f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nsz:
@@ -201,11 +199,17 @@ define float @v_minimum_f32__nnan_nsz(float %src0, float %src1) {
 ; GFX8-NEXT:    v_min_f32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v1
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v1
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -267,11 +271,7 @@ define float @v_minimum_f32__nnan_src0(float %arg0, float %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f32_e32 v0, 1.0, v0
-; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_src0:
@@ -344,11 +344,7 @@ define float @v_minimum_f32__nnan_src1(float %src0, float %arg1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    v_add_f32_e32 v1, 1.0, v1
-; GFX950-NEXT:    v_min_f32_e32 v2, v0, v1
-; GFX950-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v1
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v1, v1
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_f32__nnan_src1:
@@ -429,12 +425,8 @@ define void @s_minimum_f32(float inreg %src0, float inreg %src1) {
 ; GFX950-LABEL: s_minimum_f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s1
-; GFX950-NEXT:    v_min_f32_e32 v1, s0, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s1, s1
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v0
 ; GFX950-NEXT:    ;;#ASMEND
@@ -521,15 +513,8 @@ define <2 x float> @v_minimum_v2f32(<2 x float> %src0, <2 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v2f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32:
@@ -583,12 +568,19 @@ define <2 x float> @v_minimum_v2f32__nnan(<2 x float> %src0, <2 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -657,15 +649,8 @@ define <2 x float> @v_minimum_v2f32__nsz(<2 x float> %src0, <2 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v2f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v4, v0, v2
-; GFX950-NEXT:    v_mov_b32_e32 v5, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v2
-; GFX950-NEXT:    v_min_f32_e32 v2, v1, v3
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v3
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v5, v2, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nsz:
@@ -719,12 +704,19 @@ define <2 x float> @v_minimum_v2f32__nnan_nsz(<2 x float> %src0, <2 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v1, v1, v3
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v2f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v2
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v3
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v2f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v2, v2
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v3, v3
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v2f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -808,16 +800,10 @@ define void @s_minimum_v2f32(<2 x float> inreg %src0, <2 x float> inreg %src1) {
 ; GFX950-LABEL: s_minimum_v2f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_mov_b32_e32 v0, s3
-; GFX950-NEXT:    v_min_f32_e32 v1, s1, v0
-; GFX950-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s1, v0
-; GFX950-NEXT:    v_mov_b32_e32 v0, s2
-; GFX950-NEXT:    v_min_f32_e32 v3, s0, v0
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, s0, v0
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX950-NEXT:    v_mov_b32_e32 v0, s1
+; GFX950-NEXT:    v_minimum3_f32 v1, v0, s3, s3
+; GFX950-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, s2, s2
 ; GFX950-NEXT:    ;;#ASMSTART
 ; GFX950-NEXT:    ; use v[0:1]
 ; GFX950-NEXT:    ;;#ASMEND
@@ -920,19 +906,9 @@ define <3 x float> @v_minimum_v3f32(<3 x float> %src0, <3 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v3f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX950-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX950-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32:
@@ -995,13 +971,21 @@ define <3 x float> @v_minimum_v3f32__nnan(<3 x float> %src0, <3 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1082,19 +1066,9 @@ define <3 x float> @v_minimum_v3f32__nsz(<3 x float> %src0, <3 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v3f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v6, v0, v3
-; GFX950-NEXT:    v_mov_b32_e32 v7, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v3
-; GFX950-NEXT:    v_min_f32_e32 v3, v1, v4
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v7, v6, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v4
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
-; GFX950-NEXT:    v_min_f32_e32 v3, v2, v5
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v7, v3, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nsz:
@@ -1157,13 +1131,21 @@ define <3 x float> @v_minimum_v3f32__nnan_nsz(<3 x float> %src0, <3 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v2, v2, v5
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v3f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v3
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v4
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v5
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v3
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v4
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v5
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v3f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v3, v3
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v5, v5
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v3f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1253,23 +1235,10 @@ define <4 x float> @v_minimum_v4f32(<4 x float> %src0, <4 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v4f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX950-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX950-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX950-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32:
@@ -1341,14 +1310,23 @@ define <4 x float> @v_minimum_v4f32__nnan(<4 x float> %src0, <4 x float> %src1)
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f32__nnan:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f32__nnan:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f32__nnan:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan:
 ; GFX10:       ; %bb.0:
@@ -1440,23 +1418,10 @@ define <4 x float> @v_minimum_v4f32__nsz(<4 x float> %src0, <4 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v4f32__nsz:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v8, v0, v4
-; GFX950-NEXT:    v_mov_b32_e32 v9, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v4
-; GFX950-NEXT:    v_min_f32_e32 v4, v1, v5
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v9, v8, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v5
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v9, v4, vcc
-; GFX950-NEXT:    v_min_f32_e32 v4, v2, v6
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v6
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v9, v4, vcc
-; GFX950-NEXT:    v_min_f32_e32 v4, v3, v7
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v7
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v9, v4, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nsz:
@@ -1528,14 +1493,23 @@ define <4 x float> @v_minimum_v4f32__nnan_nsz(<4 x float> %src0, <4 x float> %sr
 ; GFX8-NEXT:    v_min_f32_e32 v3, v3, v7
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GFX9-LABEL: v_minimum_v4f32__nnan_nsz:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_min_f32_e32 v0, v0, v4
-; GFX9-NEXT:    v_min_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_min_f32_e32 v2, v2, v6
-; GFX9-NEXT:    v_min_f32_e32 v3, v3, v7
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX900-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX900-NEXT:    v_min_f32_e32 v0, v0, v4
+; GFX900-NEXT:    v_min_f32_e32 v1, v1, v5
+; GFX900-NEXT:    v_min_f32_e32 v2, v2, v6
+; GFX900-NEXT:    v_min_f32_e32 v3, v3, v7
+; GFX900-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX950-LABEL: v_minimum_v4f32__nnan_nsz:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v4, v4
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v5, v5
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v6, v6
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v7, v7
+; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v4f32__nnan_nsz:
 ; GFX10:       ; %bb.0:
@@ -1663,39 +1637,14 @@ define <8 x float> @v_minimum_v8f32(<8 x float> %src0, <8 x float> %src1) {
 ; GFX950-LABEL: v_minimum_v8f32:
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v16, v0, v8
-; GFX950-NEXT:    v_mov_b32_e32 v17, 0x7fc00000
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v8
-; GFX950-NEXT:    v_min_f32_e32 v8, v1, v9
-; GFX950-NEXT:    s_nop 0
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v17, v16, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v9
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v2, v10
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v10
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v3, v11
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v11
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v4, v12
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v12
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v5, v13
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v13
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v6, v14
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v14
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v17, v8, vcc
-; GFX950-NEXT:    v_min_f32_e32 v8, v7, v15
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v15
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v17, v8, vcc
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v8, v8
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v9, v9
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v10, v10
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v11, v11
+; GFX950-NEXT:    v_minimum3_f32 v4, v4, v12, v12
+; GFX950-NEXT:    v_minimum3_f32 v5, v5, v13, v13
+; GFX950-NEXT:    v_minimum3_f32 v6, v6, v14, v14
+; GFX950-NEXT:    v_minimum3_f32 v7, v7, v15, v15
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v8f32:
@@ -1980,64 +1929,23 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 ; GFX950:       ; %bb.0:
 ; GFX950-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX950-NEXT:    scratch_load_dword v31, off, s32
-; GFX950-NEXT:    v_mov_b32_e32 v32, 0x7fc00000
-; GFX950-NEXT:    v_min_f32_e32 v33, v0, v16
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v0, v16
-; GFX950-NEXT:    v_min_f32_e32 v34, v1, v17
-; GFX950-NEXT:    v_min_f32_e32 v35, v2, v18
-; GFX950-NEXT:    v_cndmask_b32_e32 v0, v32, v33, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v1, v17
-; GFX950-NEXT:    v_min_f32_e32 v36, v3, v19
-; GFX950-NEXT:    v_min_f32_e32 v37, v4, v20
-; GFX950-NEXT:    v_cndmask_b32_e32 v1, v32, v34, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v2, v18
-; GFX950-NEXT:    v_min_f32_e32 v38, v5, v21
-; GFX950-NEXT:    v_min_f32_e32 v39, v6, v22
-; GFX950-NEXT:    v_cndmask_b32_e32 v2, v32, v35, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v3, v19
-; GFX950-NEXT:    v_min_f32_e32 v48, v7, v23
-; GFX950-NEXT:    v_min_f32_e32 v49, v8, v24
-; GFX950-NEXT:    v_cndmask_b32_e32 v3, v32, v36, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v4, v20
-; GFX950-NEXT:    v_min_f32_e32 v50, v9, v25
-; GFX950-NEXT:    v_min_f32_e32 v51, v10, v26
-; GFX950-NEXT:    v_cndmask_b32_e32 v4, v32, v37, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v5, v21
-; GFX950-NEXT:    v_min_f32_e32 v52, v11, v27
-; GFX950-NEXT:    v_min_f32_e32 v53, v12, v28
-; GFX950-NEXT:    v_cndmask_b32_e32 v5, v32, v38, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v6, v22
-; GFX950-NEXT:    v_min_f32_e32 v54, v13, v29
-; GFX950-NEXT:    v_min_f32_e32 v55, v14, v30
-; GFX950-NEXT:    v_cndmask_b32_e32 v6, v32, v39, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v7, v23
+; GFX950-NEXT:    v_minimum3_f32 v0, v0, v16, v16
+; GFX950-NEXT:    v_minimum3_f32 v1, v1, v17, v17
+; GFX950-NEXT:    v_minimum3_f32 v2, v2, v18, v18
+; GFX950-NEXT:    v_minimum3_f32 v3, v3, v19, v19
+; GFX950-NEXT:    v_minimum3_f32 v4, v4, v20, v20
+; GFX950-NEXT:    v_minimum3_f32 v5, v5, v21, v21
+; GFX950-NEXT:    v_minimum3_f32 v6, v6, v22, v22
+; GFX950-NEXT:    v_minimum3_f32 v7, v7, v23, v23
+; GFX950-NEXT:    v_minimum3_f32 v8, v8, v24, v24
+; GFX950-NEXT:    v_minimum3_f32 v9, v9, v25, v25
+; GFX950-NEXT:    v_minimum3_f32 v10, v10, v26, v26
+; GFX950-NEXT:    v_minimum3_f32 v11, v11, v27, v27
+; GFX950-NEXT:    v_minimum3_f32 v12, v12, v28, v28
+; GFX950-NEXT:    v_minimum3_f32 v13, v13, v29, v29
+; GFX950-NEXT:    v_minimum3_f32 v14, v14, v30, v30
 ; GFX950-NEXT:    s_waitcnt vmcnt(0)
-; GFX950-NEXT:    v_min_f32_e32 v16, v15, v31
-; GFX950-NEXT:    v_cndmask_b32_e32 v7, v32, v48, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v8, v24
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v8, v32, v49, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v9, v25
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v9, v32, v50, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v10, v26
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v10, v32, v51, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v11, v27
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v11, v32, v52, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v12, v28
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v12, v32, v53, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v13, v29
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v13, v32, v54, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v14, v30
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v14, v32, v55, vcc
-; GFX950-NEXT:    v_cmp_o_f32_e32 vcc, v15, v31
-; GFX950-NEXT:    s_nop 1
-; GFX950-NEXT:    v_cndmask_b32_e32 v15, v32, v16, vcc
+; GFX950-NEXT:    v_minimum3_f32 v15, v15, v31, v31
 ; GFX950-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_minimum_v16f32:
@@ -2176,3 +2084,4 @@ define <16 x float> @v_minimum_v16f32(<16 x float> %src0, <16 x float> %src1) {
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GCN: {{.*}}
+; GFX9: {{.*}}